licensee 9.9.3 → 9.12.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (119) hide show
  1. checksums.yaml +4 -4
  2. data/bin/licensee +1 -0
  3. data/lib/licensee.rb +4 -2
  4. data/lib/licensee/commands/detect.rb +9 -4
  5. data/lib/licensee/commands/diff.rb +7 -8
  6. data/lib/licensee/commands/license_path.rb +2 -0
  7. data/lib/licensee/commands/version.rb +2 -0
  8. data/lib/licensee/content_helper.rb +188 -83
  9. data/lib/licensee/hash_helper.rb +2 -0
  10. data/lib/licensee/license.rb +18 -7
  11. data/lib/licensee/license_field.rb +8 -1
  12. data/lib/licensee/license_meta.rb +3 -0
  13. data/lib/licensee/license_rules.rb +2 -0
  14. data/lib/licensee/matchers.rb +2 -0
  15. data/lib/licensee/matchers/cabal.rb +16 -2
  16. data/lib/licensee/matchers/cargo.rb +3 -1
  17. data/lib/licensee/matchers/copyright.rb +4 -2
  18. data/lib/licensee/matchers/cran.rb +7 -3
  19. data/lib/licensee/matchers/dice.rb +10 -2
  20. data/lib/licensee/matchers/dist_zilla.rb +3 -1
  21. data/lib/licensee/matchers/exact.rb +3 -0
  22. data/lib/licensee/matchers/gemspec.rb +8 -5
  23. data/lib/licensee/matchers/matcher.rb +3 -1
  24. data/lib/licensee/matchers/npm_bower.rb +3 -1
  25. data/lib/licensee/matchers/package.rb +3 -0
  26. data/lib/licensee/matchers/reference.rb +3 -1
  27. data/lib/licensee/matchers/spdx.rb +3 -1
  28. data/lib/licensee/project_files.rb +2 -0
  29. data/lib/licensee/project_files/license_file.rb +13 -10
  30. data/lib/licensee/project_files/package_manager_file.rb +3 -0
  31. data/lib/licensee/project_files/project_file.rb +12 -4
  32. data/lib/licensee/project_files/readme_file.rb +7 -5
  33. data/lib/licensee/projects.rb +2 -0
  34. data/lib/licensee/projects/fs_project.rb +3 -0
  35. data/lib/licensee/projects/git_project.rb +16 -8
  36. data/lib/licensee/projects/github_project.rb +29 -9
  37. data/lib/licensee/projects/project.rb +13 -2
  38. data/lib/licensee/rule.rb +2 -0
  39. data/lib/licensee/version.rb +3 -1
  40. data/spec/bin_spec.rb +2 -0
  41. data/spec/fixture_spec.rb +46 -0
  42. data/spec/fixtures/detect.json +8 -6
  43. data/spec/fixtures/fixtures.yml +110 -0
  44. data/spec/fixtures/html/license.html +262 -0
  45. data/spec/fixtures/license-hashes.json +39 -0
  46. data/spec/fixtures/mit-optional/LICENSE.txt +21 -0
  47. data/spec/integration_spec.rb +20 -0
  48. data/spec/licensee/commands/detect_spec.rb +6 -2
  49. data/spec/licensee/commands/license_path_spec.rb +2 -0
  50. data/spec/licensee/commands/version_spec.rb +2 -0
  51. data/spec/licensee/content_helper_spec.rb +152 -36
  52. data/spec/licensee/hash_helper_spec.rb +2 -0
  53. data/spec/licensee/license_field_spec.rb +7 -0
  54. data/spec/licensee/license_meta_spec.rb +2 -0
  55. data/spec/licensee/license_rules_spec.rb +2 -0
  56. data/spec/licensee/license_spec.rb +36 -11
  57. data/spec/licensee/matchers/cabal_matcher_spec.rb +93 -0
  58. data/spec/licensee/matchers/cargo_matcher_spec.rb +2 -0
  59. data/spec/licensee/matchers/copyright_matcher_spec.rb +4 -2
  60. data/spec/licensee/matchers/cran_matcher_spec.rb +2 -0
  61. data/spec/licensee/matchers/dice_matcher_spec.rb +4 -2
  62. data/spec/licensee/matchers/dist_zilla_matcher_spec.rb +2 -0
  63. data/spec/licensee/matchers/exact_matcher_spec.rb +2 -0
  64. data/spec/licensee/matchers/gemspec_matcher_spec.rb +2 -0
  65. data/spec/licensee/matchers/matcher_spec.rb +2 -0
  66. data/spec/licensee/matchers/npm_bower_matcher_spec.rb +2 -0
  67. data/spec/licensee/matchers/package_matcher_spec.rb +2 -0
  68. data/spec/licensee/matchers/reference_matcher_spec.rb +2 -0
  69. data/spec/licensee/matchers/spdx_matcher_spec.rb +2 -0
  70. data/spec/licensee/project_files/license_file_spec.rb +4 -2
  71. data/spec/licensee/project_files/package_info_spec.rb +2 -0
  72. data/spec/licensee/project_files/project_file_spec.rb +3 -0
  73. data/spec/licensee/project_files/readme_file_spec.rb +11 -0
  74. data/spec/licensee/project_spec.rb +23 -3
  75. data/spec/licensee/projects/git_project_spec.rb +23 -0
  76. data/spec/licensee/projects/github_project_spec.rb +2 -0
  77. data/spec/licensee/rule_spec.rb +2 -0
  78. data/spec/licensee_spec.rb +3 -1
  79. data/spec/spec_helper.rb +29 -9
  80. data/spec/vendored_license_spec.rb +27 -8
  81. data/vendor/choosealicense.com/_data/meta.yml +0 -4
  82. data/vendor/choosealicense.com/_licenses/0bsd.txt +39 -0
  83. data/vendor/choosealicense.com/_licenses/afl-3.0.txt +7 -6
  84. data/vendor/choosealicense.com/_licenses/agpl-3.0.txt +0 -1
  85. data/vendor/choosealicense.com/_licenses/apache-2.0.txt +0 -1
  86. data/vendor/choosealicense.com/_licenses/artistic-2.0.txt +0 -1
  87. data/vendor/choosealicense.com/_licenses/bsd-2-clause.txt +8 -6
  88. data/vendor/choosealicense.com/_licenses/bsd-3-clause-clear.txt +1 -2
  89. data/vendor/choosealicense.com/_licenses/bsd-3-clause.txt +12 -10
  90. data/vendor/choosealicense.com/_licenses/bsl-1.0.txt +0 -1
  91. data/vendor/choosealicense.com/_licenses/cc-by-4.0.txt +0 -1
  92. data/vendor/choosealicense.com/_licenses/cc-by-sa-4.0.txt +0 -1
  93. data/vendor/choosealicense.com/_licenses/cc0-1.0.txt +0 -1
  94. data/vendor/choosealicense.com/_licenses/cecill-2.1.txt +579 -0
  95. data/vendor/choosealicense.com/_licenses/ecl-2.0.txt +0 -1
  96. data/vendor/choosealicense.com/_licenses/epl-1.0.txt +1 -2
  97. data/vendor/choosealicense.com/_licenses/epl-2.0.txt +1 -2
  98. data/vendor/choosealicense.com/_licenses/eupl-1.1.txt +0 -1
  99. data/vendor/choosealicense.com/_licenses/eupl-1.2.txt +0 -1
  100. data/vendor/choosealicense.com/_licenses/gpl-2.0.txt +0 -1
  101. data/vendor/choosealicense.com/_licenses/gpl-3.0.txt +1 -2
  102. data/vendor/choosealicense.com/_licenses/isc.txt +0 -1
  103. data/vendor/choosealicense.com/_licenses/lgpl-2.1.txt +0 -1
  104. data/vendor/choosealicense.com/_licenses/lgpl-3.0.txt +1 -2
  105. data/vendor/choosealicense.com/_licenses/lppl-1.3c.txt +0 -1
  106. data/vendor/choosealicense.com/_licenses/mit.txt +0 -1
  107. data/vendor/choosealicense.com/_licenses/mpl-2.0.txt +0 -1
  108. data/vendor/choosealicense.com/_licenses/ms-pl.txt +0 -1
  109. data/vendor/choosealicense.com/_licenses/ms-rl.txt +0 -1
  110. data/vendor/choosealicense.com/_licenses/ncsa.txt +0 -1
  111. data/vendor/choosealicense.com/_licenses/odbl-1.0.txt +573 -0
  112. data/vendor/choosealicense.com/_licenses/ofl-1.1.txt +0 -1
  113. data/vendor/choosealicense.com/_licenses/osl-3.0.txt +1 -2
  114. data/vendor/choosealicense.com/_licenses/postgresql.txt +2 -3
  115. data/vendor/choosealicense.com/_licenses/unlicense.txt +1 -2
  116. data/vendor/choosealicense.com/_licenses/upl-1.0.txt +3 -4
  117. data/vendor/choosealicense.com/_licenses/wtfpl.txt +0 -1
  118. data/vendor/choosealicense.com/_licenses/zlib.txt +0 -1
  119. metadata +41 -19
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9d245d544a683bfaff9448e8654a409cea3b6ddc808ba7f7b966c84e031255ea
4
- data.tar.gz: 801c8048260b692571828fce99aa754afeb99063849e3e330e7fbb0b1b7117db
3
+ metadata.gz: d7dc009b0467cfb305e8dac051ed4e78d2f35d0454f2e14cef0952338540f8ae
4
+ data.tar.gz: 3c27bb3dd3cea6d62fab826b81fab93d9152893851b541c91d69406cdf9fcbd8
5
5
  SHA512:
6
- metadata.gz: 918b6abcf12b00722fb85c64112f2e11334a4b43d6f3b7d3335156a6bb21b0077432030f6c132aa0950af0b4860102a9a1c2d8548a5de5e1529852ae13e29ed1
7
- data.tar.gz: 919b0c1a4390a2dd964e7f4d245194ef47e108e1336fc4f04b9795a97f17475ebc64dce12ff1bbd97bb805f5d1fc95774ece1878af17a89f4d57cf974582944b
6
+ metadata.gz: 07f19b33f70b0b73611d34e474f2aa4e4d7f62c7451cdf70f76774beceac2c75ab3d1cc5048061a848b979a54032aad6dd1ba278c79cd798029efd6873d54425
7
+ data.tar.gz: 96c5e66f65307e7feb2c00b3f06661b093c60995d049f7fd19cc27b76881965a1d33768a16b0a3a3b085e9392ef828dce9cf692ee04255dd9ea2c6d22da38da6
@@ -1,4 +1,5 @@
1
1
  #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'dotenv/load'
4
5
  require 'thor'
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require_relative 'licensee/version'
2
4
  require 'forwardable'
3
5
  require 'pathname'
@@ -19,7 +21,7 @@ module Licensee
19
21
  CONFIDENCE_THRESHOLD = 98
20
22
 
21
23
  # Base domain from which to build license URLs
22
- DOMAIN = 'http://choosealicense.com'.freeze
24
+ DOMAIN = 'http://choosealicense.com'
23
25
 
24
26
  class << self
25
27
  attr_writer :confidence_threshold
@@ -49,7 +51,7 @@ module Licensee
49
51
  end
50
52
 
51
53
  # Inverse of the confidence threshold, represented as a float
52
- # By default this will be 0.05
54
+ # By default this will be 0.02
53
55
  def inverse_confidence_threshold
54
56
  @inverse_confidence_threshold ||=
55
57
  (1 - Licensee.confidence_threshold / 100.0).round(2)
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class LicenseeCLI < Thor
2
4
  # Methods to call when displaying information about ProjectFiles
3
5
  MATCHED_FILE_METHODS = %i[
@@ -40,8 +42,10 @@ class LicenseeCLI < Thor
40
42
 
41
43
  MATCHED_FILE_METHODS.each do |method|
42
44
  next unless matched_file.respond_to? method
45
+
43
46
  value = matched_file.public_send method
44
47
  next if value.nil?
48
+
45
49
  rows << [humanize(method, :method), humanize(value, method)]
46
50
  end
47
51
  print_table rows, indent: 2
@@ -49,8 +53,9 @@ class LicenseeCLI < Thor
49
53
  next unless matched_file.is_a? Licensee::ProjectFiles::LicenseFile
50
54
  next if matched_file.confidence == 100
51
55
 
52
- licenses = licenses_by_similiarity(matched_file)
56
+ licenses = licenses_by_similarity(matched_file)
53
57
  next if licenses.empty?
58
+
54
59
  say ' Closest non-matching licenses:'
55
60
  rows = licenses[0...3].map do |license, similarity|
56
61
  spdx_id = license.meta['spdx-id']
@@ -89,15 +94,15 @@ class LicenseeCLI < Thor
89
94
  end
90
95
  end
91
96
 
92
- def licenses_by_similiarity(matched_file)
97
+ def licenses_by_similarity(matched_file)
93
98
  matcher = Licensee::Matchers::Dice.new(matched_file)
94
99
  potential_licenses = Licensee.licenses(hidden: true).select(&:wordset)
95
100
  matcher.instance_variable_set('@potential_licenses', potential_licenses)
96
- matcher.licenses_by_similiarity
101
+ matcher.licenses_by_similarity
97
102
  end
98
103
 
99
104
  def closest_license_key(matched_file)
100
- licenses = licenses_by_similiarity(matched_file)
105
+ licenses = licenses_by_similarity(matched_file)
101
106
  licenses.first.first.key unless licenses.empty?
102
107
  end
103
108
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'tmpdir'
2
4
 
3
5
  class LicenseeCLI < Thor
@@ -39,26 +41,23 @@ class LicenseeCLI < Thor
39
41
 
40
42
  def license_to_diff
41
43
  return options[:license_to_diff] if options[:license_to_diff]
42
- return project.license_file if remote?
44
+ return project.license_file if remote? || STDIN.tty? && project.license_file
43
45
 
44
46
  @license_to_diff ||= begin
45
- if STDIN.tty?
46
- error 'You must pipe license contents to the command via STDIN'
47
- exit 1
48
- end
49
-
50
47
  Licensee::ProjectFiles::LicenseFile.new(STDIN.read, 'LICENSE')
51
48
  end
52
49
  end
53
50
 
54
51
  def expected_license
55
- @expected_license ||= Licensee::License.find options[:license] if options[:license]
52
+ if options[:license]
53
+ @expected_license ||= Licensee::License.find options[:license]
54
+ end
56
55
  return @expected_license if @expected_license
57
56
 
58
57
  if options[:license]
59
58
  error "#{options[:license]} is not a valid license"
60
59
  else
61
- error 'You must provide an expected license'
60
+ error 'Usage: provide a license to diff against with --license (spdx name)'
62
61
  end
63
62
 
64
63
  error "Valid licenses: #{Licensee::License.all(hidden: true).map(&:key).join(', ')}"
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class LicenseeCLI < Thor
2
4
  desc 'license-path [PATH]', "Returns the path to the given project's license file"
3
5
  def license_path(_path)
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  class LicenseeCLI < Thor
2
4
  desc 'version', 'Return the Licensee version'
3
5
  def version
@@ -1,31 +1,105 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'set'
2
4
  require 'digest'
3
5
 
4
6
  module Licensee
5
7
  module ContentHelper
6
8
  DIGEST = Digest::SHA1
7
- END_OF_TERMS_REGEX = /^[\s#*_]*end of terms and conditions\s*$/i
8
- HR_REGEX = /[=\-\*][=\-\*\s]{3,}/
9
+ START_REGEX = /\A\s*/.freeze
10
+ END_OF_TERMS_REGEX = /^[\s#*_]*end of terms and conditions\s*$/i.freeze
9
11
  ALT_TITLE_REGEX = License::ALT_TITLE_REGEX
10
- ALL_RIGHTS_RESERVED_REGEX = /\Aall rights reserved\.?$/i
11
- WHITESPACE_REGEX = /\s+/
12
- MARKDOWN_HEADING_REGEX = /\A\s*#+/
13
- VERSION_REGEX = /\Aversion.*$/i
14
- MARKUP_REGEX = /[#_*=~\[\]()`|>]+/
15
- DEVELOPED_BY_REGEX = /\Adeveloped by:.*?\n\n/im
16
- QUOTE_BEGIN_REGEX = /[`'"‘“]/
17
- QUOTE_END_REGEX = /['"’”]/
12
+ REGEXES = {
13
+ hrs: /^\s*[=\-\*]{3,}\s*$/,
14
+ all_rights_reserved: /#{START_REGEX}all rights reserved\.?$/i,
15
+ whitespace: /\s+/,
16
+ markdown_headings: /#{START_REGEX}#+/,
17
+ version: /#{START_REGEX}version.*$/i,
18
+ span_markup: /[_*~]+(.*?)[_*~]+/,
19
+ link_markup: /\[(.+?)\]\(.+?\)/,
20
+ block_markup: /^\s*>/,
21
+ border_markup: /^[\*-](.*?)[\*-]$/,
22
+ comment_markup: %r{^\s*?[/\*]{1,2}},
23
+ url: %r{#{START_REGEX}https?://[^ ]+\n},
24
+ bullet: /\n\n\s*(?:[*-]|\(?[\da-z]{1,2}[)\.])\s+/i,
25
+ developed_by: /#{START_REGEX}developed by:.*?\n\n/im,
26
+ quote_begin: /[`'"‘“]/,
27
+ quote_end: /[`'"’”]/,
28
+ mit_optional: /\(including the next paragraph\)/i
29
+ }.freeze
30
+ NORMALIZATIONS = {
31
+ lists: { from: /^\s*(?:\d\.|\*)\s+([^\n])/, to: '- \1' },
32
+ https: { from: /http:/, to: 'https:' },
33
+ ampersands: { from: '&', to: 'and' },
34
+ dashes: { from: /(?<!^)([—–-]+)(?!$)/, to: '-' },
35
+ quotes: {
36
+ from: /#{REGEXES[:quote_begin]}+([\w -]*?\w)#{REGEXES[:quote_end]}+/,
37
+ to: '"\1"'
38
+ }
39
+ }.freeze
40
+
41
+ # Legally equivalent words that should be ignored for comparison
42
+ # See https://spdx.org/spdx-license-list/matching-guidelines
43
+ VARIETAL_WORDS = {
44
+ 'acknowledgment' => 'acknowledgement',
45
+ 'analogue' => 'analog',
46
+ 'analyse' => 'analyze',
47
+ 'artefact' => 'artifact',
48
+ 'authorisation' => 'authorization',
49
+ 'authorised' => 'authorized',
50
+ 'calibre' => 'caliber',
51
+ 'cancelled' => 'canceled',
52
+ 'capitalisations' => 'capitalizations',
53
+ 'catalogue' => 'catalog',
54
+ 'categorise' => 'categorize',
55
+ 'centre' => 'center',
56
+ 'emphasised' => 'emphasized',
57
+ 'favour' => 'favor',
58
+ 'favourite' => 'favorite',
59
+ 'fulfil' => 'fulfill',
60
+ 'fulfilment' => 'fulfillment',
61
+ 'initialise' => 'initialize',
62
+ 'judgment' => 'judgement',
63
+ 'labelling' => 'labeling',
64
+ 'labour' => 'labor',
65
+ 'licence' => 'license',
66
+ 'maximise' => 'maximize',
67
+ 'modelled' => 'modeled',
68
+ 'modelling' => 'modeling',
69
+ 'offence' => 'offense',
70
+ 'optimise' => 'optimize',
71
+ 'organisation' => 'organization',
72
+ 'organise' => 'organize',
73
+ 'practise' => 'practice',
74
+ 'programme' => 'program',
75
+ 'realise' => 'realize',
76
+ 'recognise' => 'recognize',
77
+ 'signalling' => 'signaling',
78
+ 'sub-license' => 'sublicense',
79
+ 'sub license' => 'sublicense',
80
+ 'utilisation' => 'utilization',
81
+ 'whilst' => 'while',
82
+ 'wilful' => 'wilfull',
83
+ 'non-commercial' => 'noncommercial',
84
+ 'cent' => 'percent',
85
+ 'owner' => 'holder'
86
+ }.freeze
87
+ STRIP_METHODS = %i[
88
+ hrs markdown_headings borders title version url copyright
89
+ block_markup span_markup link_markup
90
+ all_rights_reserved developed_by end_of_terms whitespace
91
+ mit_optional
92
+ ].freeze
18
93
 
19
94
  # A set of each word in the license, without duplicates
20
95
  def wordset
21
- @wordset ||= if content_normalized
22
- content_normalized.scan(/(?:\w(?:'s|(?<=s)')?)+/).to_set
23
- end
96
+ @wordset ||= content_normalized&.scan(/(?:\w(?:'s|(?<=s)')?)+/)&.to_set
24
97
  end
25
98
 
26
99
  # Number of characters in the normalized content
27
100
  def length
28
101
  return 0 unless content_normalized
102
+
29
103
  content_normalized.length
30
104
  end
31
105
 
@@ -43,8 +117,10 @@ module Licensee
43
117
  # Given another license or project file, calculates the similarity
44
118
  # as a percentage of words in common
45
119
  def similarity(other)
46
- overlap = (wordset & other.wordset).size
47
- total = wordset.size + other.wordset.size
120
+ wordset_fieldless = wordset - LicenseField.keys
121
+ fields_removed = wordset.size - wordset_fieldless.size
122
+ overlap = (wordset_fieldless & other.wordset).size
123
+ total = wordset_fieldless.size + other.wordset.size - fields_removed
48
124
  100.0 * (overlap * 2.0 / total)
49
125
  end
50
126
 
@@ -59,34 +135,21 @@ module Licensee
59
135
  # content with attribution first to detect attribution in LicenseFile
60
136
  def content_without_title_and_version
61
137
  @content_without_title_and_version ||= begin
62
- string = content.strip
63
- string = strip_markdown_headings(string)
64
- string = strip_hrs(string)
65
- string = strip_title(string) while string =~ ContentHelper.title_regex
66
- strip_version(string).strip
138
+ @_content = nil
139
+ ops = %i[html hrs comments markdown_headings title version]
140
+ ops.each { |op| strip(op) }
141
+ _content
67
142
  end
68
143
  end
69
144
 
70
- # Content without title, version, copyright, whitespace, or insturctions
71
- #
72
- # wrap - Optional width to wrap the content
73
- #
74
- # Returns a string
75
145
  def content_normalized(wrap: nil)
76
- return unless content
77
146
  @content_normalized ||= begin
78
- string = content_without_title_and_version.downcase
79
- while string =~ Matchers::Copyright::REGEX
80
- string = strip_copyright(string)
81
- end
82
- string = strip_all_rights_reserved(string)
83
- string = strip_developed_by(string)
84
- string, _partition, _instructions = string.partition(END_OF_TERMS_REGEX)
85
- string = normalize_lists(string)
86
- string = normalize_quotes(string)
87
- string = normalize_https(string)
88
- string = strip_markup(string)
89
- strip_whitespace(string)
147
+ @_content = content_without_title_and_version.downcase
148
+
149
+ (NORMALIZATIONS.keys + %i[spelling bullets]).each { |op| normalize(op) }
150
+ STRIP_METHODS.each { |op| strip(op) }
151
+
152
+ _content
90
153
  end
91
154
 
92
155
  if wrap.nil?
@@ -96,14 +159,24 @@ module Licensee
96
159
  end
97
160
  end
98
161
 
162
+ # Keep the legacy *_REGEX constants resolvable (via REGEXES) to avoid a breaking change
163
+ def self.const_missing(const)
164
+ key = const.to_s.downcase.gsub('_regex', '').to_sym
165
+ REGEXES[key] || super
166
+ end
167
+
99
168
  # Wrap text to the given line length
100
169
  def self.wrap(text, line_width = 80)
101
170
  return if text.nil?
171
+
102
172
  text = text.clone
173
+ text.gsub!(REGEXES[:bullet]) { |m| "\n#{m}\n" }
103
174
  text.gsub!(/([^\n])\n([^\n])/, '\1 \2')
104
175
 
105
176
  text = text.split("\n").collect do |line|
106
- if line.length > line_width
177
+ if line =~ REGEXES[:hrs]
178
+ line
179
+ elsif line.length > line_width
107
180
  line.gsub(/(.{1,#{line_width}})(\s+|$)/, "\\1\n").strip
108
181
  else
109
182
  line
@@ -114,82 +187,114 @@ module Licensee
114
187
  end
115
188
 
116
189
  def self.format_percent(float)
117
- "#{format('%.2f', float)}%"
190
+ "#{format('%<float>.2f', float: float)}%"
118
191
  end
119
192
 
120
193
  def self.title_regex
121
- licenses = Licensee::License.all(hidden: true, psuedo: false)
122
- titles = licenses.map(&:title_regex)
123
-
124
- # Title regex must include the version to support matching within
125
- # families, but for sake of normalization, we can be less strict
126
- without_versions = licenses.map do |license|
127
- next if license.title == license.name_without_version
128
- Regexp.new Regexp.escape(license.name_without_version), 'i'
129
- end
130
- titles.concat(without_versions.compact)
194
+ @title_regex ||= begin
195
+ licenses = Licensee::License.all(hidden: true, psuedo: false)
196
+ titles = licenses.map(&:title_regex)
197
+
198
+ # Title regex must include the version to support matching within
199
+ # families, but for sake of normalization, we can be less strict
200
+ without_versions = licenses.map do |license|
201
+ next if license.title == license.name_without_version
202
+
203
+ Regexp.new Regexp.escape(license.name_without_version), 'i'
204
+ end
205
+ titles.concat(without_versions.compact)
131
206
 
132
- /\A\s*\(?(the )?#{Regexp.union titles}.*$/i
207
+ /#{START_REGEX}\(?(?:the )?#{Regexp.union titles}.*?$/i
208
+ end
133
209
  end
134
210
 
135
211
  private
136
212
 
137
- def strip_title(string)
138
- strip(string, ContentHelper.title_regex)
213
+ def _content
214
+ @_content ||= content.to_s.dup.strip
139
215
  end
140
216
 
141
- def strip_version(string)
142
- strip(string, VERSION_REGEX)
217
+ def strip(regex_or_sym)
218
+ return unless _content
219
+
220
+ if regex_or_sym.is_a?(Symbol)
221
+ meth = "strip_#{regex_or_sym}"
222
+ return send(meth) if respond_to?(meth, true)
223
+
224
+ unless REGEXES[regex_or_sym]
225
+ raise ArgumentError, "#{regex_or_sym} is an invalid regex reference"
226
+ end
227
+
228
+ regex_or_sym = REGEXES[regex_or_sym]
229
+ end
230
+
231
+ @_content = _content.gsub(regex_or_sym, ' ').squeeze(' ').strip
143
232
  end
144
233
 
145
- def strip_copyright(string)
146
- strip(string, Matchers::Copyright::REGEX)
234
+ def strip_title
235
+ while _content =~ ContentHelper.title_regex
236
+ strip(ContentHelper.title_regex)
237
+ end
147
238
  end
148
239
 
149
- # Strip HRs from MPL
150
- def strip_hrs(string)
151
- strip(string, HR_REGEX)
240
+ def strip_borders
241
+ normalize(REGEXES[:border_markup], '\1')
152
242
  end
153
243
 
154
- # Strip leading #s from the document
155
- def strip_markdown_headings(string)
156
- strip(string, MARKDOWN_HEADING_REGEX)
244
+ def strip_comments
245
+ lines = _content.split("\n")
246
+ return if lines.count == 1
247
+ return unless lines.all? { |line| line =~ REGEXES[:comment_markup] }
248
+
249
+ strip(:comment_markup)
157
250
  end
158
251
 
159
- def strip_whitespace(string)
160
- strip(string, WHITESPACE_REGEX)
252
+ def strip_copyright
253
+ regex = Matchers::Copyright::REGEX
254
+ strip(regex) while _content =~ regex
161
255
  end
162
256
 
163
- def strip_all_rights_reserved(string)
164
- strip(string, ALL_RIGHTS_RESERVED_REGEX)
257
+ def strip_end_of_terms
258
+ body, _partition, _instructions = _content.partition(END_OF_TERMS_REGEX)
259
+ @_content = body
165
260
  end
166
261
 
167
- def strip_markup(string)
168
- strip(string, MARKUP_REGEX)
262
+ def strip_span_markup
263
+ normalize(REGEXES[:span_markup], '\1')
169
264
  end
170
265
 
171
- def strip_developed_by(string)
172
- strip(string, DEVELOPED_BY_REGEX)
266
+ def strip_link_markup
267
+ normalize(REGEXES[:link_markup], '\1')
173
268
  end
174
269
 
175
- def strip(string, regex)
176
- string.gsub(regex, ' ').squeeze(' ').strip
270
+ def strip_html
271
+ return unless respond_to?(:filename) && filename
272
+ return unless File.extname(filename) =~ /\.html?/i
273
+
274
+ require 'reverse_markdown'
275
+ @_content = ReverseMarkdown.convert(_content, unknown_tags: :bypass)
177
276
  end
178
277
 
179
- # Replace all enclosing quotes with double quotes
180
- # Single versus double quotes don't alter the meaning, and it's easier to
181
- # strip double quotes if we still want to allow possessives
182
- def normalize_quotes(string)
183
- string.gsub(/#{QUOTE_BEGIN_REGEX}+([\w -]*?\w)#{QUOTE_END_REGEX}+/,
184
- '"\1"')
278
+ def normalize(from_or_key, to = nil)
279
+ operation = { from: from_or_key, to: to } if to
280
+ operation ||= NORMALIZATIONS[from_or_key]
281
+
282
+ if operation
283
+ @_content = _content.gsub operation[:from], operation[:to]
284
+ elsif respond_to?("normalize_#{from_or_key}", true)
285
+ send("normalize_#{from_or_key}")
286
+ else
287
+ raise ArgumentError, "#{from_or_key} is an invalid normalization"
288
+ end
185
289
  end
186
290
 
187
- def normalize_https(string)
188
- string.gsub(/http:/, 'https:')
291
+ def normalize_spelling
292
+ normalize(/\b#{Regexp.union(VARIETAL_WORDS.keys)}\b/, VARIETAL_WORDS)
189
293
  end
190
294
 
191
- def normalize_lists(string)
192
- string.gsub(/^\s*(\d\.|\*)/, '-')
295
+ def normalize_bullets
296
+ normalize(REGEXES[:bullet], "\n\n* ")
297
+ normalize(/\)\s+\(/, ')(')
193
298
  end
194
299
  end
195
300
  end