RubyGems - licensee - Versions diffs - 9.9.3 → 9.12.0 - Mend

licensee 9.9.3 → 9.12.0

Files changed (119) hide show

checksums.yaml +4 -4
data/bin/licensee +1 -0
data/lib/licensee.rb +4 -2
data/lib/licensee/commands/detect.rb +9 -4
data/lib/licensee/commands/diff.rb +7 -8
data/lib/licensee/commands/license_path.rb +2 -0
data/lib/licensee/commands/version.rb +2 -0
data/lib/licensee/content_helper.rb +188 -83
data/lib/licensee/hash_helper.rb +2 -0
data/lib/licensee/license.rb +18 -7
data/lib/licensee/license_field.rb +8 -1
data/lib/licensee/license_meta.rb +3 -0
data/lib/licensee/license_rules.rb +2 -0
data/lib/licensee/matchers.rb +2 -0
data/lib/licensee/matchers/cabal.rb +16 -2
data/lib/licensee/matchers/cargo.rb +3 -1
data/lib/licensee/matchers/copyright.rb +4 -2
data/lib/licensee/matchers/cran.rb +7 -3
data/lib/licensee/matchers/dice.rb +10 -2
data/lib/licensee/matchers/dist_zilla.rb +3 -1
data/lib/licensee/matchers/exact.rb +3 -0
data/lib/licensee/matchers/gemspec.rb +8 -5
data/lib/licensee/matchers/matcher.rb +3 -1
data/lib/licensee/matchers/npm_bower.rb +3 -1
data/lib/licensee/matchers/package.rb +3 -0
data/lib/licensee/matchers/reference.rb +3 -1
data/lib/licensee/matchers/spdx.rb +3 -1
data/lib/licensee/project_files.rb +2 -0
data/lib/licensee/project_files/license_file.rb +13 -10
data/lib/licensee/project_files/package_manager_file.rb +3 -0
data/lib/licensee/project_files/project_file.rb +12 -4
data/lib/licensee/project_files/readme_file.rb +7 -5
data/lib/licensee/projects.rb +2 -0
data/lib/licensee/projects/fs_project.rb +3 -0
data/lib/licensee/projects/git_project.rb +16 -8
data/lib/licensee/projects/github_project.rb +29 -9
data/lib/licensee/projects/project.rb +13 -2
data/lib/licensee/rule.rb +2 -0
data/lib/licensee/version.rb +3 -1
data/spec/bin_spec.rb +2 -0
data/spec/fixture_spec.rb +46 -0
data/spec/fixtures/detect.json +8 -6
data/spec/fixtures/fixtures.yml +110 -0
data/spec/fixtures/html/license.html +262 -0
data/spec/fixtures/license-hashes.json +39 -0
data/spec/fixtures/mit-optional/LICENSE.txt +21 -0
data/spec/integration_spec.rb +20 -0
data/spec/licensee/commands/detect_spec.rb +6 -2
data/spec/licensee/commands/license_path_spec.rb +2 -0
data/spec/licensee/commands/version_spec.rb +2 -0
data/spec/licensee/content_helper_spec.rb +152 -36
data/spec/licensee/hash_helper_spec.rb +2 -0
data/spec/licensee/license_field_spec.rb +7 -0
data/spec/licensee/license_meta_spec.rb +2 -0
data/spec/licensee/license_rules_spec.rb +2 -0
data/spec/licensee/license_spec.rb +36 -11
data/spec/licensee/matchers/cabal_matcher_spec.rb +93 -0
data/spec/licensee/matchers/cargo_matcher_spec.rb +2 -0
data/spec/licensee/matchers/copyright_matcher_spec.rb +4 -2
data/spec/licensee/matchers/cran_matcher_spec.rb +2 -0
data/spec/licensee/matchers/dice_matcher_spec.rb +4 -2
data/spec/licensee/matchers/dist_zilla_matcher_spec.rb +2 -0
data/spec/licensee/matchers/exact_matcher_spec.rb +2 -0
data/spec/licensee/matchers/gemspec_matcher_spec.rb +2 -0
data/spec/licensee/matchers/matcher_spec.rb +2 -0
data/spec/licensee/matchers/npm_bower_matcher_spec.rb +2 -0
data/spec/licensee/matchers/package_matcher_spec.rb +2 -0
data/spec/licensee/matchers/reference_matcher_spec.rb +2 -0
data/spec/licensee/matchers/spdx_matcher_spec.rb +2 -0
data/spec/licensee/project_files/license_file_spec.rb +4 -2
data/spec/licensee/project_files/package_info_spec.rb +2 -0
data/spec/licensee/project_files/project_file_spec.rb +3 -0
data/spec/licensee/project_files/readme_file_spec.rb +11 -0
data/spec/licensee/project_spec.rb +23 -3
data/spec/licensee/projects/git_project_spec.rb +23 -0
data/spec/licensee/projects/github_project_spec.rb +2 -0
data/spec/licensee/rule_spec.rb +2 -0
data/spec/licensee_spec.rb +3 -1
data/spec/spec_helper.rb +29 -9
data/spec/vendored_license_spec.rb +27 -8
data/vendor/choosealicense.com/_data/meta.yml +0 -4
data/vendor/choosealicense.com/_licenses/0bsd.txt +39 -0
data/vendor/choosealicense.com/_licenses/afl-3.0.txt +7 -6
data/vendor/choosealicense.com/_licenses/agpl-3.0.txt +0 -1
data/vendor/choosealicense.com/_licenses/apache-2.0.txt +0 -1
data/vendor/choosealicense.com/_licenses/artistic-2.0.txt +0 -1
data/vendor/choosealicense.com/_licenses/bsd-2-clause.txt +8 -6
data/vendor/choosealicense.com/_licenses/bsd-3-clause-clear.txt +1 -2
data/vendor/choosealicense.com/_licenses/bsd-3-clause.txt +12 -10
data/vendor/choosealicense.com/_licenses/bsl-1.0.txt +0 -1
data/vendor/choosealicense.com/_licenses/cc-by-4.0.txt +0 -1
data/vendor/choosealicense.com/_licenses/cc-by-sa-4.0.txt +0 -1
data/vendor/choosealicense.com/_licenses/cc0-1.0.txt +0 -1
data/vendor/choosealicense.com/_licenses/cecill-2.1.txt +579 -0
data/vendor/choosealicense.com/_licenses/ecl-2.0.txt +0 -1
data/vendor/choosealicense.com/_licenses/epl-1.0.txt +1 -2
data/vendor/choosealicense.com/_licenses/epl-2.0.txt +1 -2
data/vendor/choosealicense.com/_licenses/eupl-1.1.txt +0 -1
data/vendor/choosealicense.com/_licenses/eupl-1.2.txt +0 -1
data/vendor/choosealicense.com/_licenses/gpl-2.0.txt +0 -1
data/vendor/choosealicense.com/_licenses/gpl-3.0.txt +1 -2
data/vendor/choosealicense.com/_licenses/isc.txt +0 -1
data/vendor/choosealicense.com/_licenses/lgpl-2.1.txt +0 -1
data/vendor/choosealicense.com/_licenses/lgpl-3.0.txt +1 -2
data/vendor/choosealicense.com/_licenses/lppl-1.3c.txt +0 -1
data/vendor/choosealicense.com/_licenses/mit.txt +0 -1
data/vendor/choosealicense.com/_licenses/mpl-2.0.txt +0 -1
data/vendor/choosealicense.com/_licenses/ms-pl.txt +0 -1
data/vendor/choosealicense.com/_licenses/ms-rl.txt +0 -1
data/vendor/choosealicense.com/_licenses/ncsa.txt +0 -1
data/vendor/choosealicense.com/_licenses/odbl-1.0.txt +573 -0
data/vendor/choosealicense.com/_licenses/ofl-1.1.txt +0 -1
data/vendor/choosealicense.com/_licenses/osl-3.0.txt +1 -2
data/vendor/choosealicense.com/_licenses/postgresql.txt +2 -3
data/vendor/choosealicense.com/_licenses/unlicense.txt +1 -2
data/vendor/choosealicense.com/_licenses/upl-1.0.txt +3 -4
data/vendor/choosealicense.com/_licenses/wtfpl.txt +0 -1
data/vendor/choosealicense.com/_licenses/zlib.txt +0 -1
metadata +41 -19

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 9d245d544a683bfaff9448e8654a409cea3b6ddc808ba7f7b966c84e031255ea
-  data.tar.gz: 801c8048260b692571828fce99aa754afeb99063849e3e330e7fbb0b1b7117db
+  metadata.gz: d7dc009b0467cfb305e8dac051ed4e78d2f35d0454f2e14cef0952338540f8ae
+  data.tar.gz: 3c27bb3dd3cea6d62fab826b81fab93d9152893851b541c91d69406cdf9fcbd8
 SHA512:
-  metadata.gz: 918b6abcf12b00722fb85c64112f2e11334a4b43d6f3b7d3335156a6bb21b0077432030f6c132aa0950af0b4860102a9a1c2d8548a5de5e1529852ae13e29ed1
-  data.tar.gz: 919b0c1a4390a2dd964e7f4d245194ef47e108e1336fc4f04b9795a97f17475ebc64dce12ff1bbd97bb805f5d1fc95774ece1878af17a89f4d57cf974582944b
+  metadata.gz: 07f19b33f70b0b73611d34e474f2aa4e4d7f62c7451cdf70f76774beceac2c75ab3d1cc5048061a848b979a54032aad6dd1ba278c79cd798029efd6873d54425
+  data.tar.gz: 96c5e66f65307e7feb2c00b3f06661b093c60995d049f7fd19cc27b76881965a1d33768a16b0a3a3b085e9392ef828dce9cf692ee04255dd9ea2c6d22da38da6

data/bin/licensee CHANGED

@@ -1,4 +1,5 @@
 #!/usr/bin/env ruby
+# frozen_string_literal: true
 require 'dotenv/load'
 require 'thor'

data/lib/licensee.rb CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require_relative 'licensee/version'
 require 'forwardable'
 require 'pathname'
@@ -19,7 +21,7 @@ module Licensee
   CONFIDENCE_THRESHOLD = 98
   # Base domain from which to build license URLs
-  DOMAIN = 'http://choosealicense.com'.freeze
+  DOMAIN = 'http://choosealicense.com'
   class << self
     attr_writer :confidence_threshold
@@ -49,7 +51,7 @@ module Licensee
     end
     # Inverse of the confidence threshold, represented as a float
-    # By default this will be 0.05
+    # By default this will be 0.02
     def inverse_confidence_threshold
       @inverse_confidence_threshold ||=
         (1 - Licensee.confidence_threshold / 100.0).round(2)

data/lib/licensee/commands/detect.rb CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 class LicenseeCLI < Thor
   # Methods to call when displaying information about ProjectFiles
   MATCHED_FILE_METHODS = %i[
@@ -40,8 +42,10 @@ class LicenseeCLI < Thor
       MATCHED_FILE_METHODS.each do |method|
         next unless matched_file.respond_to? method
         value = matched_file.public_send method
         next if value.nil?
         rows << [humanize(method, :method), humanize(value, method)]
       end
       print_table rows, indent: 2
@@ -49,8 +53,9 @@ class LicenseeCLI < Thor
       next unless matched_file.is_a? Licensee::ProjectFiles::LicenseFile
       next if matched_file.confidence == 100
-      licenses = licenses_by_similiarity(matched_file)
+      licenses = licenses_by_similarity(matched_file)
       next if licenses.empty?
       say '  Closest non-matching licenses:'
       rows = licenses[0...3].map do |license, similarity|
         spdx_id = license.meta['spdx-id']
@@ -89,15 +94,15 @@ class LicenseeCLI < Thor
     end
   end
-  def licenses_by_similiarity(matched_file)
+  def licenses_by_similarity(matched_file)
     matcher = Licensee::Matchers::Dice.new(matched_file)
     potential_licenses = Licensee.licenses(hidden: true).select(&:wordset)
     matcher.instance_variable_set('@potential_licenses', potential_licenses)
-    matcher.licenses_by_similiarity
+    matcher.licenses_by_similarity
   end
   def closest_license_key(matched_file)
-    licenses = licenses_by_similiarity(matched_file)
+    licenses = licenses_by_similarity(matched_file)
     licenses.first.first.key unless licenses.empty?
   end
 end

data/lib/licensee/commands/diff.rb CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 require 'tmpdir'
 class LicenseeCLI < Thor
@@ -39,26 +41,23 @@ class LicenseeCLI < Thor
   def license_to_diff
     return options[:license_to_diff] if options[:license_to_diff]
-    return project.license_file if remote?
+    return project.license_file if remote? || STDIN.tty? && project.license_file
     @license_to_diff ||= begin
-      if STDIN.tty?
-        error 'You must pipe license contents to the command via STDIN'
-        exit 1
-      end
       Licensee::ProjectFiles::LicenseFile.new(STDIN.read, 'LICENSE')
     end
   end
   def expected_license
-    @expected_license ||= Licensee::License.find options[:license] if options[:license]
+    if options[:license]
+      @expected_license ||= Licensee::License.find options[:license]
+    end
     return @expected_license if @expected_license
     if options[:license]
       error "#{options[:license]} is not a valid license"
     else
-      error 'You must provide an expected license'
+      error 'Usage: provide a license to diff against with --license (spdx name)'
     end
     error "Valid licenses: #{Licensee::License.all(hidden: true).map(&:key).join(', ')}"

data/lib/licensee/commands/license_path.rb CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 class LicenseeCLI < Thor
   desc 'license-path [PATH]', "Returns the path to the given project's license file"
   def license_path(_path)

data/lib/licensee/commands/version.rb CHANGED

@@ -1,3 +1,5 @@
+# frozen_string_literal: true
 class LicenseeCLI < Thor
   desc 'version', 'Return the Licensee version'
   def version

data/lib/licensee/content_helper.rb CHANGED

@@ -1,31 +1,105 @@
+# frozen_string_literal: true
 require 'set'
 require 'digest'
 module Licensee
   module ContentHelper
     DIGEST = Digest::SHA1
-    END_OF_TERMS_REGEX = /^[\s#*_]*end of terms and conditions\s*$/i
-    HR_REGEX = /[=\-\*][=\-\*\s]{3,}/
+    START_REGEX = /\A\s*/.freeze
+    END_OF_TERMS_REGEX = /^[\s#*_]*end of terms and conditions\s*$/i.freeze
     ALT_TITLE_REGEX = License::ALT_TITLE_REGEX
-    ALL_RIGHTS_RESERVED_REGEX = /\Aall rights reserved\.?$/i
-    WHITESPACE_REGEX = /\s+/
-    MARKDOWN_HEADING_REGEX = /\A\s*#+/
-    VERSION_REGEX = /\Aversion.*$/i
-    MARKUP_REGEX = /[#_*=~\[\]()`|>]+/
-    DEVELOPED_BY_REGEX = /\Adeveloped by:.*?\n\n/im
-    QUOTE_BEGIN_REGEX = /[`'"‘“]/
-    QUOTE_END_REGEX = /['"’”]/
+    REGEXES = {
+      hrs:                 /^\s*[=\-\*]{3,}\s*$/,
+      all_rights_reserved: /#{START_REGEX}all rights reserved\.?$/i,
+      whitespace:          /\s+/,
+      markdown_headings:   /#{START_REGEX}#+/,
+      version:             /#{START_REGEX}version.*$/i,
+      span_markup:         /[_*~]+(.*?)[_*~]+/,
+      link_markup:         /\[(.+?)\]\(.+?\)/,
+      block_markup:        /^\s*>/,
+      border_markup:       /^[\*-](.*?)[\*-]$/,
+      comment_markup:      %r{^\s*?[/\*]{1,2}},
+      url:                 %r{#{START_REGEX}https?://[^ ]+\n},
+      bullet:              /\n\n\s*(?:[*-]|\(?[\da-z]{1,2}[)\.])\s+/i,
+      developed_by:        /#{START_REGEX}developed by:.*?\n\n/im,
+      quote_begin:         /[`'"‘“]/,
+      quote_end:           /[`'"’”]/,
+      mit_optional:        /\(including the next paragraph\)/i
+    }.freeze
+    NORMALIZATIONS = {
+      lists:      { from: /^\s*(?:\d\.|\*)\s+([^\n])/, to: '- \1' },
+      https:      { from: /http:/, to: 'https:' },
+      ampersands: { from: '&', to: 'and' },
+      dashes:     { from: /(?<!^)([—–-]+)(?!$)/, to: '-' },
+      quotes:     {
+        from: /#{REGEXES[:quote_begin]}+([\w -]*?\w)#{REGEXES[:quote_end]}+/,
+        to:   '"\1"'
+      }
+    }.freeze
+    # Legally equivalent words that should be ignored for comparison
+    # See https://spdx.org/spdx-license-list/matching-guidelines
+    VARIETAL_WORDS = {
+      'acknowledgment'  => 'acknowledgement',
+      'analogue'        => 'analog',
+      'analyse'         => 'analyze',
+      'artefact'        => 'artifact',
+      'authorisation'   => 'authorization',
+      'authorised'      => 'authorized',
+      'calibre'         => 'caliber',
+      'cancelled'       => 'canceled',
+      'capitalisations' => 'capitalizations',
+      'catalogue'       => 'catalog',
+      'categorise'      => 'categorize',
+      'centre'          => 'center',
+      'emphasised'      => 'emphasized',
+      'favour'          => 'favor',
+      'favourite'       => 'favorite',
+      'fulfil'          => 'fulfill',
+      'fulfilment'      => 'fulfillment',
+      'initialise'      => 'initialize',
+      'judgment'        => 'judgement',
+      'labelling'       => 'labeling',
+      'labour'          => 'labor',
+      'licence'         => 'license',
+      'maximise'        => 'maximize',
+      'modelled'        => 'modeled',
+      'modelling'       => 'modeling',
+      'offence'         => 'offense',
+      'optimise'        => 'optimize',
+      'organisation'    => 'organization',
+      'organise'        => 'organize',
+      'practise'        => 'practice',
+      'programme'       => 'program',
+      'realise'         => 'realize',
+      'recognise'       => 'recognize',
+      'signalling'      => 'signaling',
+      'sub-license'     => 'sublicense',
+      'sub license'     => 'sublicense',
+      'utilisation'     => 'utilization',
+      'whilst'          => 'while',
+      'wilful'          => 'wilfull',
+      'non-commercial'  => 'noncommercial',
+      'cent'            => 'percent',
+      'owner'           => 'holder'
+    }.freeze
+    STRIP_METHODS = %i[
+      hrs markdown_headings borders title version url copyright
+      block_markup span_markup link_markup
+      all_rights_reserved developed_by end_of_terms whitespace
+      mit_optional
+    ].freeze
     # A set of each word in the license, without duplicates
     def wordset
-      @wordset ||= if content_normalized
-        content_normalized.scan(/(?:\w(?:'s|(?<=s)')?)+/).to_set
-      end
+      @wordset ||= content_normalized&.scan(/(?:\w(?:'s|(?<=s)')?)+/)&.to_set
     end
     # Number of characters in the normalized content
     def length
       return 0 unless content_normalized
       content_normalized.length
     end
@@ -43,8 +117,10 @@ module Licensee
     # Given another license or project file, calculates the similarity
     # as a percentage of words in common
     def similarity(other)
-      overlap = (wordset & other.wordset).size
-      total = wordset.size + other.wordset.size
+      wordset_fieldless = wordset - LicenseField.keys
+      fields_removed = wordset.size - wordset_fieldless.size
+      overlap = (wordset_fieldless & other.wordset).size
+      total = wordset_fieldless.size + other.wordset.size - fields_removed
       100.0 * (overlap * 2.0 / total)
     end
@@ -59,34 +135,21 @@ module Licensee
     # content with attribution first to detect attribution in LicenseFile
     def content_without_title_and_version
       @content_without_title_and_version ||= begin
-        string = content.strip
-        string = strip_markdown_headings(string)
-        string = strip_hrs(string)
-        string = strip_title(string) while string =~ ContentHelper.title_regex
-        strip_version(string).strip
+        @_content = nil
+        ops = %i[html hrs comments markdown_headings title version]
+        ops.each { |op| strip(op) }
+        _content
       end
     end
-    # Content without title, version, copyright, whitespace, or insturctions
-    #
-    # wrap - Optional width to wrap the content
-    #
-    # Returns a string
     def content_normalized(wrap: nil)
-      return unless content
       @content_normalized ||= begin
-        string = content_without_title_and_version.downcase
-        while string =~ Matchers::Copyright::REGEX
-          string = strip_copyright(string)
-        end
-        string = strip_all_rights_reserved(string)
-        string = strip_developed_by(string)
-        string, _partition, _instructions = string.partition(END_OF_TERMS_REGEX)
-        string = normalize_lists(string)
-        string = normalize_quotes(string)
-        string = normalize_https(string)
-        string = strip_markup(string)
-        strip_whitespace(string)
+        @_content = content_without_title_and_version.downcase
+        (NORMALIZATIONS.keys + %i[spelling bullets]).each { |op| normalize(op) }
+        STRIP_METHODS.each { |op| strip(op) }
+        _content
       end
       if wrap.nil?
@@ -96,14 +159,24 @@ module Licensee
       end
     end
+    # Backwards compatibalize constants to avoid a breaking change
+    def self.const_missing(const)
+      key = const.to_s.downcase.gsub('_regex', '').to_sym
+      REGEXES[key] || super
+    end
     # Wrap text to the given line length
     def self.wrap(text, line_width = 80)
       return if text.nil?
       text = text.clone
+      text.gsub!(REGEXES[:bullet]) { |m| "\n#{m}\n" }
       text.gsub!(/([^\n])\n([^\n])/, '\1 \2')
       text = text.split("\n").collect do |line|
-        if line.length > line_width
+        if line =~ REGEXES[:hrs]
+          line
+        elsif line.length > line_width
           line.gsub(/(.{1,#{line_width}})(\s+|$)/, "\\1\n").strip
         else
           line
@@ -114,82 +187,114 @@ module Licensee
     end
     def self.format_percent(float)
-      "#{format('%.2f', float)}%"
+      "#{format('%<float>.2f', float: float)}%"
     end
     def self.title_regex
-      licenses = Licensee::License.all(hidden: true, psuedo: false)
-      titles = licenses.map(&:title_regex)
-      # Title regex must include the version to support matching within
-      # families, but for sake of normalization, we can be less strict
-      without_versions = licenses.map do |license|
-        next if license.title == license.name_without_version
-        Regexp.new Regexp.escape(license.name_without_version), 'i'
-      end
-      titles.concat(without_versions.compact)
+      @title_regex ||= begin
+        licenses = Licensee::License.all(hidden: true, psuedo: false)
+        titles = licenses.map(&:title_regex)
+        # Title regex must include the version to support matching within
+        # families, but for sake of normalization, we can be less strict
+        without_versions = licenses.map do |license|
+          next if license.title == license.name_without_version
+          Regexp.new Regexp.escape(license.name_without_version), 'i'
+        end
+        titles.concat(without_versions.compact)
-      /\A\s*\(?(the )?#{Regexp.union titles}.*$/i
+        /#{START_REGEX}\(?(?:the )?#{Regexp.union titles}.*?$/i
+      end
     end
     private
-    def strip_title(string)
-      strip(string, ContentHelper.title_regex)
+    def _content
+      @_content ||= content.to_s.dup.strip
     end
-    def strip_version(string)
-      strip(string, VERSION_REGEX)
+    def strip(regex_or_sym)
+      return unless _content
+      if regex_or_sym.is_a?(Symbol)
+        meth = "strip_#{regex_or_sym}"
+        return send(meth) if respond_to?(meth, true)
+        unless REGEXES[regex_or_sym]
+          raise ArgumentError, "#{regex_or_sym} is an invalid regex reference"
+        end
+        regex_or_sym = REGEXES[regex_or_sym]
+      end
+      @_content = _content.gsub(regex_or_sym, ' ').squeeze(' ').strip
     end
-    def strip_copyright(string)
-      strip(string, Matchers::Copyright::REGEX)
+    def strip_title
+      while _content =~ ContentHelper.title_regex
+        strip(ContentHelper.title_regex)
+      end
     end
-    # Strip HRs from MPL
-    def strip_hrs(string)
-      strip(string, HR_REGEX)
+    def strip_borders
+      normalize(REGEXES[:border_markup], '\1')
     end
-    # Strip leading #s from the document
-    def strip_markdown_headings(string)
-      strip(string, MARKDOWN_HEADING_REGEX)
+    def strip_comments
+      lines = _content.split("\n")
+      return if lines.count == 1
+      return unless lines.all? { |line| line =~ REGEXES[:comment_markup] }
+      strip(:comment_markup)
     end
-    def strip_whitespace(string)
-      strip(string, WHITESPACE_REGEX)
+    def strip_copyright
+      regex = Matchers::Copyright::REGEX
+      strip(regex) while _content =~ regex
     end
-    def strip_all_rights_reserved(string)
-      strip(string, ALL_RIGHTS_RESERVED_REGEX)
+    def strip_end_of_terms
+      body, _partition, _instructions = _content.partition(END_OF_TERMS_REGEX)
+      @_content = body
     end
-    def strip_markup(string)
-      strip(string, MARKUP_REGEX)
+    def strip_span_markup
+      normalize(REGEXES[:span_markup], '\1')
     end
-    def strip_developed_by(string)
-      strip(string, DEVELOPED_BY_REGEX)
+    def strip_link_markup
+      normalize(REGEXES[:link_markup], '\1')
     end
-    def strip(string, regex)
-      string.gsub(regex, ' ').squeeze(' ').strip
+    def strip_html
+      return unless respond_to?(:filename) && filename
+      return unless File.extname(filename) =~ /\.html?/i
+      require 'reverse_markdown'
+      @_content = ReverseMarkdown.convert(_content, unknown_tags: :bypass)
     end
-    # Replace all enclosing quotes with double quotes
-    # Single versus double quotes don't alter the meaning, and it's easier to
-    # strip double quotes if we still want to allow possessives
-    def normalize_quotes(string)
-      string.gsub(/#{QUOTE_BEGIN_REGEX}+([\w -]*?\w)#{QUOTE_END_REGEX}+/,
-                  '"\1"')
+    def normalize(from_or_key, to = nil)
+      operation = { from: from_or_key, to: to } if to
+      operation ||= NORMALIZATIONS[from_or_key]
+      if operation
+        @_content = _content.gsub operation[:from], operation[:to]
+      elsif respond_to?("normalize_#{from_or_key}", true)
+        send("normalize_#{from_or_key}")
+      else
+        raise ArgumentError, "#{from_or_key} is an invalid normalization"
+      end
     end
-    def normalize_https(string)
-      string.gsub(/http:/, 'https:')
+    def normalize_spelling
+      normalize(/\b#{Regexp.union(VARIETAL_WORDS.keys)}\b/, VARIETAL_WORDS)
     end
-    def normalize_lists(string)
-      string.gsub(/^\s*(\d\.|\*)/, '-')
+    def normalize_bullets
+      normalize(REGEXES[:bullet], "\n\n* ")
+      normalize(/\)\s+\(/, ')(')
     end
   end
 end