RubyGems - suma - Versions diffs - 0.3.0 → 0.4.0 - Mend

suma 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +4 -4
data/.gitignore +2 -5
data/README.adoc +70 -12
data/lib/suma/cli/check_svg_quality.rb +82 -91
data/lib/suma/cli/export.rb +37 -19
data/lib/suma/cli/reformat.rb +39 -63
data/lib/suma/cli/validate.rb +14 -5
data/lib/suma/cli/validate_links.rb +11 -155
data/lib/suma/express_reformatter.rb +94 -0
data/lib/suma/link_validation.rb +144 -0
data/lib/suma/link_validator.rb +2 -1
data/lib/suma/schema_collection.rb +1 -1
data/lib/suma/schema_exporter.rb +14 -26
data/lib/suma/svg_quality/scanner.rb +61 -0
data/lib/suma/svg_quality.rb +1 -0
data/lib/suma/term_classification.rb +78 -0
data/lib/suma/term_extractor.rb +16 -45
data/lib/suma/version.rb +1 -1
data/lib/suma.rb +4 -0
metadata +6 -2

data/lib/suma/cli/validate_links.rb CHANGED Viewed

@@ -1,168 +1,24 @@
 # frozen_string_literal: true
 require "thor"
-require "expressir"
 module Suma
   module Cli
+    # Deprecated: prefer +Cli::Validate#links+ (the +suma validate links+
+    # subcommand). This class is retained as a backwards-compat entry
+    # point — all orchestration now lives in +Suma::LinkValidation+.
     class ValidateLinks < Thor
       desc "extract_and_validate SCHEMAS_FILE DOCUMENTS_PATH [OUTPUT_FILE]",
            "Extract and validate express links without creating intermediate file"
       def extract_and_validate(schemas_file = "schemas-srl.yml",
-                              documents_path = "documents",
-                              output_file = "validation_results.txt")
-        load_dependencies
-        paths = prepare_file_paths(schemas_file, documents_path, output_file)
-        schemas_config = load_schemas_config(paths[:schemas_file])
-        exp_files = collect_schema_paths(schemas_config, paths[:schemas_file_rel])
-        adoc_files = find_adoc_files(paths[:documents_path])
-        all_files = adoc_files + exp_files
-        display_file_counts(adoc_files, exp_files)
-        links_by_file = extract_links(all_files)
-        repo = load_express_schemas(schemas_config)
-        index = SchemaIndex.new(repo)
-        unresolved = LinkValidator.new(index).validate(links_by_file)
-        write_validation_results(paths[:output_file], paths[:output_file_rel],
-                                 unresolved, links_by_file)
-      end
-      private
-      def load_dependencies
-        require "expressir"
-        require "ruby-progressbar"
-        require "pathname"
-      end
-      def prepare_file_paths(schemas_file, documents_path, output_file)
-        schemas_file_path = Pathname.new(schemas_file).expand_path
-        documents_path_exp = Pathname.new(documents_path).expand_path
-        output_file_path = Pathname.new(output_file).expand_path
-        schemas_file_rel = Pathname.new(schemas_file_path).relative_path_from(Pathname.pwd).to_s
-        documents_path_rel = Pathname.new(documents_path_exp).relative_path_from(Pathname.pwd).to_s
-        output_file_rel = Pathname.new(output_file_path).relative_path_from(Pathname.pwd).to_s
-        puts "Extracting and validating express links using schemas from #{schemas_file_rel}..."
-        puts "Looking for documents in #{documents_path_rel}..."
-        {
-          schemas_file: schemas_file_path,
-          schemas_file_rel: schemas_file_rel,
-          documents_path: documents_path_exp,
-          documents_path_rel: documents_path_rel,
-          output_file: output_file_path,
-          output_file_rel: output_file_rel,
-        }
-      end
-      def load_schemas_config(schemas_file_path)
-        schemas_config = Expressir::SchemaManifest.from_yaml(File.read(schemas_file_path))
-        schemas_config.set_initial_path(schemas_file_path.to_s)
-        schemas_config
-      rescue StandardError => e
-        raise Suma::Error, "Error loading schemas file: #{e.message}"
-      end
-      def collect_schema_paths(schemas_config, schemas_file_rel)
-        exp_files = schemas_config.schemas.filter_map(&:path)
-        puts "Found #{exp_files.size} EXPRESS schema files from #{schemas_file_rel}"
-        exp_files
-      end
-      def find_adoc_files(documents_path)
-        Dir.glob(documents_path.join("**", "*.adoc").to_s)
-      end
-      def display_file_counts(adoc_files, exp_files)
-        puts "Found #{adoc_files.size} AsciiDoc files and #{exp_files.size} EXPRESS files"
-      end
-      def create_progress_bar(title, total)
-        ProgressBar.create(
-          title: title,
-          total: total,
-          format: "%t: [%B] %p%% %c/%C %e",
-          progress_mark: "=",
-          remainder_mark: " ",
-          length: 80,
-        )
-      end
-      def extract_links(files)
-        links_by_file = {}
-        link_count = 0
-        progress = create_progress_bar("Processing files", files.size)
-        files.each do |file|
-          progress.increment
-          begin
-            content = File.read(file)
-            express_links = content.scan(/<<express:([^,>]+)(?:,[^>]+)?>>/).flatten.uniq
-            if express_links.any?
-              links_by_file[file] = express_links
-              link_count += express_links.size
-            end
-          rescue StandardError => e
-            puts "\nWarning: Could not read file #{file}: #{e.message}"
-          end
-        end
-        puts "\nExtracted #{link_count} unique express links from #{links_by_file.size} files"
-        links_by_file
-      end
-      def load_express_schemas(schemas_config)
-        schema_paths = {}
-        schemas_config.schemas.each { |s| schema_paths[s.id] = s.path }
-        puts "Loading #{schema_paths.size} EXPRESS schemas for validation..."
-        loading_progress = create_progress_bar("Loading schemas", schema_paths.size)
-        begin
-          repo = Expressir::Express::Parser.from_files(schema_paths.values) do |filename, _schemas, error|
-            loading_progress.increment
-            puts "\nWarning: Error loading schema #{filename}: #{error.message}" if error
-          end
-          puts "Successfully loaded #{repo.schemas.size} schemas"
-          repo
-        rescue StandardError => e
-          raise Suma::Error, "Error loading schemas: #{e.message}"
-        end
-      end
-      def write_validation_results(output_file_path, output_file_rel,
-  unresolved_links, links_by_file)
-        total_links = links_by_file.values.sum(&:size)
-        results = []
-        results << "Validation complete. Checked #{total_links} links."
-        if unresolved_links.empty?
-          results << "✅ All links resolved successfully!"
-        else
-          results << "❌ Found #{unresolved_links.size} unresolved links:"
-          unresolved_links.each do |issue|
-            results << "#{issue.file}:#{issue.line} - <<express:#{issue.link}>> - #{issue.reason}"
-          end
-        end
-        begin
-          File.write(output_file_path, results.join("\n"))
-          puts "Validation results written to #{output_file_rel}"
-        rescue StandardError => e
-          puts "Error writing to output file: #{e.message}"
-          puts results
-        end
+                               documents_path = "documents",
+                               output_file = "validation_results.txt")
+        result = LinkValidation.new(
+          schemas_file: schemas_file,
+          documents_path: documents_path,
+          output_file: output_file,
+        ).call
+        puts LinkValidation.generate_summary(result)
       end
     end
   end

data/lib/suma/express_reformatter.rb ADDED Viewed

@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+module Suma
+  # Reformats EXPRESS source into suma's canonical form: comment blocks
+  # (the +(*"...\n*)+ remarks that EXPRESS uses for documentation) are
+  # extracted from their inline positions and appended to the end of
+  # the file, and excess blank lines are collapsed.
+  #
+  # Pure content transformation — no I/O, no Thor, no filesystem. The
+  # CLI adapter handles reading from and writing to disk.
+  #
+  # The transform is idempotent: reformatting an already-reformatted
+  # document returns +changed?: false+.
+  #
+  # Comment extraction uses +String#index+ rather than a single m-mode
+  # regex. The original +/\(\*"(.*?)\n\*\)/m+ is correct functionally
+  # but is polynomial-time on adversarial input (CodeQL
+  # +rb/polynomial-redos+). Scanning for the start marker, then for
+  # the next terminator, is unambiguously linear.
+  module ExpressReformatter
+    Result = Struct.new(:content, :changed?, keyword_init: true) do
+      def content_or_nil
+        changed? ? content : nil
+      end
+    end
+    COMMENT_START = '(*"'
+    COMMENT_END = "\n*)"
+    BLANK_RUN_PATTERN = /(\n\n+)/
+    NEWLINE_RUN_PATTERN = /(\n+)/
+    module_function
+    def call(content)
+      comments = extract_comments(content)
+      return Result.new(content: content, changed?: false) if comments.empty?
+      without_comments = strip_comments(content)
+      new_comments = comments.map { |c| "#{COMMENT_START}#{c}#{COMMENT_END}" }
+        .join("\n\n")
+      new_content = "#{without_comments}\n\n#{new_comments}\n"
+      new_content = new_content.gsub(BLANK_RUN_PATTERN, "\n\n")
+      changed = normalised_compare(content, new_content) != 0
+      Result.new(content: new_content, changed?: changed)
+    end
+    def extract_comments(content)
+      comments = []
+      pos = 0
+      while (start_idx = content.index(COMMENT_START, pos))
+        end_idx = content.index(COMMENT_END, start_idx + COMMENT_START.length)
+        break unless end_idx
+        comments << content[(start_idx + COMMENT_START.length)...end_idx]
+        pos = end_idx + COMMENT_END.length
+      end
+      comments
+    end
+    def strip_comments(content)
+      return content unless content.index(COMMENT_START, 0)
+      stripped = +""
+      last_end = 0
+      each_comment_range(content) do |range|
+        stripped << content[range[:pre_start]...range[:start]]
+        last_end = range[:end_exclusive]
+      end
+      stripped << content[last_end..]
+      stripped.force_encoding(content.encoding)
+    end
+    def each_comment_range(content)
+      return enum_for(:each_comment_range, content) unless block_given?
+      pos = 0
+      while (start_idx = content.index(COMMENT_START, pos))
+        end_idx = content.index(COMMENT_END, start_idx + COMMENT_START.length)
+        break unless end_idx
+        yield pre_start: pos, start: start_idx,
+              end_exclusive: end_idx + COMMENT_END.length
+        pos = end_idx + COMMENT_END.length
+      end
+    end
+    def normalised_compare(left, right)
+      left.gsub(NEWLINE_RUN_PATTERN, "\n") <=> right.gsub(NEWLINE_RUN_PATTERN, "\n")
+    end
+    private_class_method :normalised_compare
+  end
+end

data/lib/suma/link_validation.rb ADDED Viewed

@@ -0,0 +1,144 @@
+# frozen_string_literal: true
+require "pathname"
+require "expressir"
+module Suma
+  # Deep module behind the EXPRESS cross-reference validation pipeline.
+  #
+  # Interface: paths in, +Result+ out. The CLI is a thin adapter that
+  # constructs this object and prints the result.
+  #
+  # Owns: loading the schemas manifest, discovering + reading .adoc and
+  # .exp files, extracting express cross-reference links, loading parsed
+  # schemas into a +SchemaIndex+, delegating to +LinkValidator+ for the
+  # actual resolution, and writing the summary file.
+  #
+  # Does not own: presentation (use +LinkValidation.generate_summary+),
+  # command-line argument parsing (the CLI adapter does that), or the
+  # link-resolution rules themselves (that is +LinkValidator+'s job).
+  class LinkValidation
+    EXPRESS_LINK_PATTERN = /<<express:([^,>]+)(?:,[^>]+)?>>/
+    attr_reader :schemas_file, :documents_path, :output_file,
+                :progress, :logger
+    def initialize(schemas_file:, documents_path:, output_file:,
+                   progress: NullProgress.new, logger: Utils)
+      @schemas_file = Pathname.new(schemas_file).expand_path
+      @documents_path = Pathname.new(documents_path).expand_path
+      @output_file = output_file && Pathname.new(output_file).expand_path
+      @progress = progress
+      @logger = logger
+    end
+    def call
+      config = load_schemas_config
+      exp_files = collect_schema_paths(config)
+      adoc_files = find_adoc_files
+      links_by_file = extract_links(adoc_files + exp_files)
+      unresolved = validate_links(config, links_by_file)
+      result = Result.new(
+        adoc_count: adoc_files.size,
+        exp_count: exp_files.size,
+        total_links: links_by_file.values.sum(&:size),
+        unresolved: unresolved,
+      )
+      write_summary(result) if output_file
+      result
+    end
+    def self.generate_summary(result)
+      lines = []
+      lines << "Validation complete. Checked #{result.total_links} links."
+      if result.success?
+        lines << "✅ All links resolved successfully!"
+      else
+        lines << "❌ Found #{result.unresolved.size} unresolved links:"
+        result.unresolved.each { |issue| lines << format_issue(issue) }
+      end
+      lines.join("\n")
+    end
+    def self.format_issue(issue)
+      "#{issue.file}:#{issue.line} - " \
+        "<<express:#{issue.link}>> - #{issue.reason}"
+    end
+    private_class_method :format_issue
+    # Default no-op progress adapter. Callers that want a real progress
+    # bar pass an object responding to +#start(title, total)+ and
+    # +#increment+; this satisfies the same interface without forcing
+    # the dependency.
+    class NullProgress
+      def start(_title, _total); end
+      def increment; end
+    end
+    Result = Struct.new(
+      :adoc_count,
+      :exp_count,
+      :total_links,
+      :unresolved,
+      keyword_init: true,
+    ) do
+      def success?
+        unresolved.empty?
+      end
+    end
+    private
+    def load_schemas_config
+      Expressir::SchemaManifest.from_yaml(File.read(schemas_file))
+        .tap { |c| c.set_initial_path(schemas_file.to_s) }
+    rescue StandardError => e
+      raise Error, "Error loading schemas file: #{e.message}"
+    end
+    def collect_schema_paths(schemas_config)
+      schemas_config.schemas.filter_map(&:path)
+    end
+    def find_adoc_files
+      Dir.glob(documents_path.join("**", "*.adoc").to_s)
+    end
+    def extract_links(files)
+      links_by_file = {}
+      progress.start("Processing files", files.size)
+      files.each do |file|
+        links = extract_links_from_file(file)
+        links_by_file[file] = links if links&.any?
+      end
+      links_by_file
+    end
+    def extract_links_from_file(file)
+      progress.increment
+      content = File.read(file)
+      content.scan(EXPRESS_LINK_PATTERN).flatten.uniq
+    rescue StandardError => e
+      logger.log "Warning: Could not read file #{file}: #{e.message}"
+      nil
+    end
+    def validate_links(schemas_config, links_by_file)
+      paths_by_id = schemas_config.schemas.to_h { |s| [s.id, s.path] }
+      progress.start("Loading schemas", paths_by_id.size)
+      repo = Expressir::Express::Parser.from_files(paths_by_id.values) do |*_args|
+        progress.increment
+      end
+      index = SchemaIndex.new(repo)
+      LinkValidator.new(index).validate(links_by_file)
+    end
+    def write_summary(result)
+      FileUtils.mkdir_p(output_file.dirname)
+      File.write(output_file, self.class.generate_summary(result))
+    rescue StandardError => e
+      logger.log "Error writing to output file: #{e.message}"
+    end
+  end
+end

data/lib/suma/link_validator.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require "expressir"
+require "suma/link_validation"
 module Suma
   LinkValidationResult = Struct.new(:file, :line, :link, :reason,
@@ -28,7 +29,7 @@ module Suma
       content = File.read(file)
       index = {}
       content.lines.each_with_index do |line, idx|
-        line.scan(/<<express:([^,>]+)(?:,[^>]+)?>>/).flatten.each do |link|
+        line.scan(LinkValidation::EXPRESS_LINK_PATTERN).flatten.each do |link|
           index[link] ||= idx
         end
       end

data/lib/suma/schema_collection.rb CHANGED Viewed

@@ -40,7 +40,7 @@ module Suma
       finalize
       exporter = SchemaExporter.new(
-        schemas: @config.schemas,
+        schemas: schemas.values,
         output_path: @output_path_schemas,
         options: { annotations: false },
       )

data/lib/suma/schema_exporter.rb CHANGED Viewed

@@ -3,8 +3,19 @@
 require "fileutils"
 module Suma
-  # SchemaExporter exports EXPRESS schemas from a manifest
-  # with configurable options for annotations and ZIP packaging
+  # Exports EXPRESS schemas to a directory, with optional ZIP packaging.
+  #
+  # Pure sink: the exporter accepts already-loaded +Suma::ExpressSchema+
+  # instances and writes their content to disk. Construction of those
+  # instances (with the right +output_path+ and +is_standalone_file+
+  # flags) is the caller's responsibility — the exporter does not
+  # reach across the seam to inspect manifest entries or classify
+  # schema types itself.
+  #
+  # This is a deep module: a small interface (one +export+ method, one
+  # option hash) backed by save_exp + zip packaging. The CLI and
+  # SchemaCollection adapters construct ExpressSchema instances; the
+  # exporter never inspects their shape.
   class SchemaExporter
     attr_reader :schemas, :output_path, :options
@@ -35,30 +46,7 @@ module Suma
     def export_to_directory(schemas)
       schemas.each do |schema|
-        export_single_schema(schema)
-      end
-    end
-    def export_single_schema(schema)
-      is_standalone = !schema.is_a?(Expressir::SchemaManifestEntry)
-      schema_output_path = determine_output_path(schema, is_standalone)
-      express_schema = ExpressSchema.new(
-        id: schema.id,
-        path: schema.path.to_s,
-        output_path: schema_output_path,
-        is_standalone_file: is_standalone,
-      )
-      express_schema.save_exp(with_annotations: options[:annotations])
-    end
-    def determine_output_path(schema, is_standalone)
-      if is_standalone
-        output_path.to_s
-      else
-        category = SchemaCategory.for_schema(id: schema.id, path: schema.path)
-        output_path.join(category.directory).to_s
+        schema.save_exp(with_annotations: options[:annotations])
       end
     end

data/lib/suma/svg_quality/scanner.rb ADDED Viewed

@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+require "svg_conform"
+module Suma
+  module SvgQuality
+    # Deep module behind the SVG-quality seam: takes paths in, returns
+    # +Report+ / +BatchReport+ out. Owns validator construction and
+    # per-file result capture. Does not own file discovery, sorting,
+    # filtering, or presentation — those stay in the CLI adapter.
+    #
+    # The progress adapter is injected: pass any object responding to
+    # +#call(index, total, report)+. +NullProgress+ is the default
+    # no-op; the CLI passes a lambda that writes to +$stderr+.
+    class Scanner
+      DEFAULT_PROFILE = :metanorma
+      attr_reader :profile, :progress
+      def initialize(profile: DEFAULT_PROFILE,
+                     progress: NullProgress.new)
+        @profile = profile
+        @progress = progress
+      end
+      def scan(paths)
+        validator = build_validator
+        reports = paths.each_with_index.map do |path, index|
+          scan_one(validator, path, index, paths.size)
+        end
+        BatchReport.new(reports)
+      end
+      def scan_file(path)
+        Report.new(path.to_s, build_validator.validate_file(path.to_s,
+                                                            profile: profile))
+      end
+      # Default no-op progress adapter. Real progress reporters are
+      # passed by the caller; this satisfies the same interface so the
+      # scanner can be invoked from specs without forcing the
+      # dependency.
+      class NullProgress
+        def call(_index, _total, _report); end
+      end
+      private
+      def build_validator
+        SvgConform::Validator.new
+      end
+      def scan_one(validator, path, index, total)
+        result = validator.validate_file(path.to_s, profile: profile)
+        report = Report.new(path.to_s, result)
+        progress.call(index, total, report)
+        report
+      end
+    end
+  end
+end

data/lib/suma/svg_quality.rb CHANGED Viewed

@@ -6,6 +6,7 @@ module Suma
   module SvgQuality
     autoload :Report,      "suma/svg_quality/report"
     autoload :BatchReport, "suma/svg_quality/batch_report"
+    autoload :Scanner,     "suma/svg_quality/scanner"
     module QualityTiers
       CRITICAL = { name: :critical, min_errors: 200, emoji: "💥" }.freeze

data/lib/suma/term_classification.rb ADDED Viewed

@@ -0,0 +1,78 @@
+# frozen_string_literal: true
+module Suma
+  # Term-extractor-specific classification of an EXPRESS schema.
+  #
+  # Bridges ExpressSchema::Type (the canonical classification shared
+  # across the codebase) and the Glossarist-specific labels
+  # TermExtractor emits: the domain string ("application module" /
+  # "resource") that goes into a concept's +domain+ field, and the
+  # entity-type URN term used in generated concept definitions.
+  #
+  # The mapping is data — a frozen Hash keyed by ExpressSchema::Type
+  # symbol — so adding a new schema type is a one-line addition to
+  # BY_TYPE (open/closed principle). The previous implementation
+  # switched on string keys in three separate places; this consolidates
+  # them into one source of truth.
+  class TermClassification
+    attr_reader :type, :domain_label, :entity_term, :entity_display
+    def initialize(type:, domain_label:, entity_term:, entity_display:)
+      @type = type
+      @domain_label = domain_label
+      @entity_term = entity_term
+      @entity_display = entity_display
+      freeze
+    end
+    def domain_for(schema_id)
+      "#{domain_label}: #{schema_id}"
+    end
+    BY_TYPE = {
+      ExpressSchema::Type::RESOURCE => new(
+        type: ExpressSchema::Type::RESOURCE,
+        domain_label: "resource",
+        entity_term: "express-language.entity_data_type",
+        entity_display: "entity data type",
+      ),
+      ExpressSchema::Type::MODULE_ARM => new(
+        type: ExpressSchema::Type::MODULE_ARM,
+        domain_label: "application module",
+        entity_term: "general.application_object",
+        entity_display: "application object",
+      ),
+      ExpressSchema::Type::MODULE_MIM => new(
+        type: ExpressSchema::Type::MODULE_MIM,
+        domain_label: "application module",
+        entity_term: "express-language.entity_data_type",
+        entity_display: "entity data type",
+      ),
+      ExpressSchema::Type::BUSINESS_OBJECT_MODEL => new(
+        type: ExpressSchema::Type::BUSINESS_OBJECT_MODEL,
+        domain_label: "resource",
+        entity_term: "express-language.entity_data_type",
+        entity_display: "entity data type",
+      ),
+      ExpressSchema::Type::CORE_MODEL => new(
+        type: ExpressSchema::Type::CORE_MODEL,
+        domain_label: "resource",
+        entity_term: "express-language.entity_data_type",
+        entity_display: "entity data type",
+      ),
+      ExpressSchema::Type::STANDALONE => new(
+        type: ExpressSchema::Type::STANDALONE,
+        domain_label: "resource",
+        entity_term: "express-language.entity_data_type",
+        entity_display: "entity data type",
+      ),
+    }.freeze
+    def self.for_schema(id:, path:)
+      type = ExpressSchema::Type.classify(id: id, path: path)
+      BY_TYPE.fetch(type) do |t|
+        raise Error, "[suma] no term classification for type #{t.inspect}"
+      end
+    end
+  end
+end