kumi 0.0.13 → 0.0.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec +0 -1
- data/BACKLOG.md +34 -0
- data/CHANGELOG.md +33 -0
- data/CLAUDE.md +4 -6
- data/README.md +0 -45
- data/config/functions.yaml +352 -0
- data/docs/dev/analyzer-debug.md +52 -0
- data/docs/dev/parse-command.md +64 -0
- data/docs/dev/vm-profiling.md +95 -0
- data/docs/features/README.md +0 -7
- data/docs/functions/analyzer_integration.md +199 -0
- data/docs/functions/signatures.md +171 -0
- data/examples/hash_objects_demo.rb +138 -0
- data/golden/array_operations/schema.kumi +17 -0
- data/golden/cascade_logic/schema.kumi +16 -0
- data/golden/mixed_nesting/schema.kumi +42 -0
- data/golden/simple_math/schema.kumi +10 -0
- data/lib/kumi/analyzer.rb +76 -22
- data/lib/kumi/compiler.rb +6 -5
- data/lib/kumi/core/analyzer/checkpoint.rb +72 -0
- data/lib/kumi/core/analyzer/debug.rb +167 -0
- data/lib/kumi/core/analyzer/passes/broadcast_detector.rb +1 -3
- data/lib/kumi/core/analyzer/passes/function_signature_pass.rb +199 -0
- data/lib/kumi/core/analyzer/passes/ir_dependency_pass.rb +67 -0
- data/lib/kumi/core/analyzer/passes/load_input_cse.rb +120 -0
- data/lib/kumi/core/analyzer/passes/lower_to_ir_pass.rb +72 -157
- data/lib/kumi/core/analyzer/passes/toposorter.rb +40 -36
- data/lib/kumi/core/analyzer/state_serde.rb +64 -0
- data/lib/kumi/core/analyzer/structs/access_plan.rb +12 -10
- data/lib/kumi/core/compiler/access_planner.rb +3 -2
- data/lib/kumi/core/function_registry/collection_functions.rb +3 -1
- data/lib/kumi/core/functions/dimension.rb +98 -0
- data/lib/kumi/core/functions/dtypes.rb +20 -0
- data/lib/kumi/core/functions/errors.rb +11 -0
- data/lib/kumi/core/functions/kernel_adapter.rb +45 -0
- data/lib/kumi/core/functions/loader.rb +119 -0
- data/lib/kumi/core/functions/registry_v2.rb +68 -0
- data/lib/kumi/core/functions/shape.rb +70 -0
- data/lib/kumi/core/functions/signature.rb +122 -0
- data/lib/kumi/core/functions/signature_parser.rb +86 -0
- data/lib/kumi/core/functions/signature_resolver.rb +272 -0
- data/lib/kumi/core/ir/execution_engine/interpreter.rb +110 -7
- data/lib/kumi/core/ir/execution_engine/profiler.rb +330 -0
- data/lib/kumi/core/ir/execution_engine.rb +6 -15
- data/lib/kumi/dev/ir.rb +75 -0
- data/lib/kumi/dev/parse.rb +105 -0
- data/lib/kumi/dev/profile_aggregator.rb +301 -0
- data/lib/kumi/dev/profile_runner.rb +199 -0
- data/lib/kumi/dev/runner.rb +85 -0
- data/lib/kumi/dev.rb +14 -0
- data/lib/kumi/frontends/ruby.rb +28 -0
- data/lib/kumi/frontends/text.rb +46 -0
- data/lib/kumi/frontends.rb +29 -0
- data/lib/kumi/kernels/ruby/aggregate_core.rb +105 -0
- data/lib/kumi/kernels/ruby/datetime_scalar.rb +21 -0
- data/lib/kumi/kernels/ruby/mask_scalar.rb +15 -0
- data/lib/kumi/kernels/ruby/scalar_core.rb +63 -0
- data/lib/kumi/kernels/ruby/string_scalar.rb +19 -0
- data/lib/kumi/kernels/ruby/vector_struct.rb +39 -0
- data/lib/kumi/runtime/executable.rb +108 -45
- data/lib/kumi/schema.rb +12 -6
- data/lib/kumi/support/diff.rb +22 -0
- data/lib/kumi/support/ir_render.rb +61 -0
- data/lib/kumi/version.rb +1 -1
- data/lib/kumi.rb +3 -0
- data/performance_results.txt +63 -0
- data/scripts/test_mixed_nesting_performance.rb +206 -0
- metadata +50 -6
- data/docs/features/analysis-cascade-mutual-exclusion.md +0 -89
- data/docs/features/javascript-transpiler.md +0 -148
- data/lib/kumi/js.rb +0 -23
- data/lib/kumi/support/ir_dump.rb +0 -491
@@ -0,0 +1,330 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "json"
|
4
|
+
require "fileutils"
|
5
|
+
require "time"
|
6
|
+
require "set"
|
7
|
+
|
8
|
+
module Kumi
|
9
|
+
module Core
|
10
|
+
module IR
|
11
|
+
module ExecutionEngine
|
12
|
+
module Profiler
|
13
|
+
class << self
|
14
|
+
def enabled? = ENV["KUMI_PROFILE"] == "1"
|
15
|
+
def ops_enabled? = ENV.fetch("KUMI_PROFILE_OPS", "1") == "1"
|
16
|
+
def sample_rate = (ENV["KUMI_PROFILE_SAMPLE"]&.to_i || 1)
|
17
|
+
def persistent? = ENV["KUMI_PROFILE_PERSISTENT"] == "1"
|
18
|
+
|
19
|
+
def set_schema_name(name)
|
20
|
+
@schema_name = name
|
21
|
+
|
22
|
+
# Ensure profiler is initialized in persistent mode
|
23
|
+
unless @initialized
|
24
|
+
@events = []
|
25
|
+
@meta = {}
|
26
|
+
@file = ENV["KUMI_PROFILE_FILE"] || "tmp/profile.jsonl"
|
27
|
+
@run_id ||= 1
|
28
|
+
@op_seq ||= 0
|
29
|
+
@aggregated_stats ||= Hash.new { |h, k| h[k] = { count: 0, total_ms: 0.0, total_cpu_ms: 0.0, rows: 0, runs: Set.new } }
|
30
|
+
|
31
|
+
# Truncate file if needed
|
32
|
+
if ENV["KUMI_PROFILE_TRUNCATE"] == "1" && !@persistent_initialized
|
33
|
+
FileUtils.mkdir_p(File.dirname(@file))
|
34
|
+
File.write(@file, "")
|
35
|
+
@aggregated_stats.clear
|
36
|
+
@persistent_initialized = true
|
37
|
+
end
|
38
|
+
|
39
|
+
@initialized = true
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
def reset!(meta: {})
|
44
|
+
set_schema_name(meta[:schema_name]) if meta[:schema_name]
|
45
|
+
return unless enabled?
|
46
|
+
|
47
|
+
# In persistent mode, don't reset aggregated stats or increment run_id
|
48
|
+
# This allows profiling across multiple schema creations
|
49
|
+
if persistent?
|
50
|
+
@events = []
|
51
|
+
@meta = (@meta || {}).merge(meta)
|
52
|
+
@schema_name = meta[:schema_name] if meta[:schema_name]
|
53
|
+
@file = ENV["KUMI_PROFILE_FILE"] || "tmp/profile.jsonl"
|
54
|
+
@run_id ||= 1
|
55
|
+
@op_seq ||= 0
|
56
|
+
@aggregated_stats ||= Hash.new { |h, k| h[k] = { count: 0, total_ms: 0.0, total_cpu_ms: 0.0, rows: 0, runs: Set.new } }
|
57
|
+
|
58
|
+
# Only truncate on very first reset in persistent mode
|
59
|
+
if ENV["KUMI_PROFILE_TRUNCATE"] == "1" && !@persistent_initialized
|
60
|
+
FileUtils.mkdir_p(File.dirname(@file))
|
61
|
+
File.write(@file, "")
|
62
|
+
@aggregated_stats.clear
|
63
|
+
@persistent_initialized = true
|
64
|
+
end
|
65
|
+
else
|
66
|
+
# Original behavior: full reset each time
|
67
|
+
@events = []
|
68
|
+
@meta = meta
|
69
|
+
@schema_name = meta[:schema_name]
|
70
|
+
@file = ENV["KUMI_PROFILE_FILE"] || "tmp/profile.jsonl"
|
71
|
+
@run_id = (@run_id || 0) + 1
|
72
|
+
@op_seq = 0
|
73
|
+
@aggregated_stats = (@aggregated_stats || Hash.new { |h, k| h[k] = { count: 0, total_ms: 0.0, total_cpu_ms: 0.0, rows: 0, runs: Set.new } })
|
74
|
+
|
75
|
+
if ENV["KUMI_PROFILE_TRUNCATE"] == "1"
|
76
|
+
FileUtils.mkdir_p(File.dirname(@file))
|
77
|
+
File.write(@file, "")
|
78
|
+
@aggregated_stats.clear
|
79
|
+
end
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
# monotonic start time
|
84
|
+
def t0
|
85
|
+
Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
86
|
+
end
|
87
|
+
|
88
|
+
# CPU time start (process + thread)
|
89
|
+
def cpu_t0
|
90
|
+
Process.clock_gettime(Process::CLOCK_PROCESS_CPUTIME_ID)
|
91
|
+
end
|
92
|
+
|
93
|
+
# Phase timing for coarse-grained operations
|
94
|
+
def phase(name, tags = {})
|
95
|
+
return yield unless enabled?
|
96
|
+
p0 = t0; c0 = cpu_t0
|
97
|
+
result = yield
|
98
|
+
wall_ms = (t0 - p0) * 1000.0
|
99
|
+
cpu_ms = (cpu_t0 - c0) * 1000.0
|
100
|
+
stream({
|
101
|
+
ts: Time.now.utc.iso8601(3),
|
102
|
+
kind: "phase",
|
103
|
+
name: name,
|
104
|
+
wall_ms: wall_ms.round(3),
|
105
|
+
cpu_ms: cpu_ms.round(3),
|
106
|
+
tags: tags,
|
107
|
+
run: @run_id
|
108
|
+
})
|
109
|
+
result
|
110
|
+
end
|
111
|
+
|
112
|
+
# Memory snapshot with GC statistics
|
113
|
+
def memory_snapshot(label, extra: {})
|
114
|
+
return unless enabled?
|
115
|
+
s = GC.stat
|
116
|
+
stream({
|
117
|
+
ts: Time.now.utc.iso8601(3),
|
118
|
+
kind: "mem",
|
119
|
+
label: label,
|
120
|
+
heap_live: s[:heap_live_slots],
|
121
|
+
old_objects: s[:old_objects],
|
122
|
+
minor_gc: s[:minor_gc_count],
|
123
|
+
major_gc: s[:major_gc_count],
|
124
|
+
rss_mb: read_rss_mb,
|
125
|
+
run: @run_id,
|
126
|
+
**extra
|
127
|
+
})
|
128
|
+
end
|
129
|
+
|
130
|
+
def read_rss_mb
|
131
|
+
((File.read("/proc/#{$$}/status")[/VmRSS:\s+(\d+)\skB/, 1].to_i) / 1024.0).round(2)
|
132
|
+
rescue
|
133
|
+
nil
|
134
|
+
end
|
135
|
+
|
136
|
+
# Per-op record with both wall time and CPU time (with sampling support)
|
137
|
+
def record!(decl:, idx:, tag:, op:, t0:, cpu_t0: nil, rows: nil, note: nil)
|
138
|
+
return unless enabled? && ops_enabled?
|
139
|
+
@op_seq += 1
|
140
|
+
return unless sample_rate <= 1 || (@op_seq % sample_rate).zero?
|
141
|
+
|
142
|
+
wall_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - t0) * 1000.0)
|
143
|
+
cpu_ms = cpu_t0 ? ((Process.clock_gettime(Process::CLOCK_PROCESS_CPUTIME_ID) - cpu_t0) * 1000.0) : wall_ms
|
144
|
+
|
145
|
+
ev = {
|
146
|
+
ts: Time.now.utc.iso8601(3),
|
147
|
+
run: @run_id,
|
148
|
+
schema: @schema_name, # schema identifier for multi-schema differentiation
|
149
|
+
decl: decl, # decl name (string/symbol)
|
150
|
+
i: idx, # op index
|
151
|
+
tag: tag, # op tag (symbol)
|
152
|
+
wall_ms: wall_ms.round(4),
|
153
|
+
cpu_ms: cpu_ms.round(4),
|
154
|
+
rows: rows,
|
155
|
+
note: note,
|
156
|
+
key: op_key(decl, idx, tag, op), # stable key for grep/diff
|
157
|
+
attrs: compact_attrs(op.attrs)
|
158
|
+
}
|
159
|
+
|
160
|
+
# Aggregate stats for multi-run averaging
|
161
|
+
op_key = "#{decl}@#{idx}:#{tag}"
|
162
|
+
agg = @aggregated_stats[op_key]
|
163
|
+
agg[:count] += 1
|
164
|
+
agg[:total_ms] += wall_ms
|
165
|
+
agg[:total_cpu_ms] += cpu_ms
|
166
|
+
agg[:rows] += (rows || 0)
|
167
|
+
agg[:runs] << @run_id
|
168
|
+
agg[:decl] = decl
|
169
|
+
agg[:tag] = tag
|
170
|
+
agg[:idx] = idx
|
171
|
+
agg[:note] = note if note
|
172
|
+
|
173
|
+
(@events ||= []) << ev
|
174
|
+
stream(ev) if ENV["KUMI_PROFILE_STREAM"] == "1"
|
175
|
+
ev
|
176
|
+
end
|
177
|
+
|
178
|
+
def summary(top: 20)
|
179
|
+
return {} unless enabled?
|
180
|
+
|
181
|
+
# Current run summary (legacy format)
|
182
|
+
current_agg = Hash.new { |h, k| h[k] = { count: 0, ms: 0.0, rows: 0 } }
|
183
|
+
(@events || []).each do |e|
|
184
|
+
k = [e[:decl], e[:tag]]
|
185
|
+
a = current_agg[k]
|
186
|
+
a[:count] += 1
|
187
|
+
a[:ms] += (e[:wall_ms] || e[:ms] || 0)
|
188
|
+
a[:rows] += (e[:rows] || 0)
|
189
|
+
end
|
190
|
+
current_ranked = current_agg.map { |(decl, tag), v|
|
191
|
+
{ decl: decl, tag: tag, count: v[:count], ms: v[:ms].round(3), rows: v[:rows],
|
192
|
+
rps: v[:rows] > 0 ? (v[:rows] / v[:ms]).round(1) : nil }
|
193
|
+
}.sort_by { |h| -h[:ms] }.first(top)
|
194
|
+
|
195
|
+
{ meta: @meta || {}, top: current_ranked,
|
196
|
+
total_ms: ((@events || []).sum { |e| e[:wall_ms] || e[:ms] || 0 }).round(3),
|
197
|
+
op_count: (@events || []).size,
|
198
|
+
run_id: @run_id }
|
199
|
+
end
|
200
|
+
|
201
|
+
# Multi-run averaged analysis
|
202
|
+
def averaged_analysis(top: 20)
|
203
|
+
return {} unless enabled? && @aggregated_stats&.any?
|
204
|
+
|
205
|
+
# Convert aggregated stats to averaged metrics
|
206
|
+
averaged = @aggregated_stats.map do |op_key, stats|
|
207
|
+
num_runs = stats[:runs].size
|
208
|
+
avg_wall_ms = stats[:total_ms] / stats[:count]
|
209
|
+
avg_cpu_ms = stats[:total_cpu_ms] / stats[:count]
|
210
|
+
total_wall_ms = stats[:total_ms]
|
211
|
+
total_cpu_ms = stats[:total_cpu_ms]
|
212
|
+
|
213
|
+
{
|
214
|
+
op_key: op_key,
|
215
|
+
decl: stats[:decl],
|
216
|
+
idx: stats[:idx],
|
217
|
+
tag: stats[:tag],
|
218
|
+
runs: num_runs,
|
219
|
+
total_calls: stats[:count],
|
220
|
+
calls_per_run: stats[:count] / num_runs.to_f,
|
221
|
+
avg_wall_ms: avg_wall_ms.round(4),
|
222
|
+
avg_cpu_ms: avg_cpu_ms.round(4),
|
223
|
+
total_wall_ms: total_wall_ms.round(3),
|
224
|
+
total_cpu_ms: total_cpu_ms.round(3),
|
225
|
+
cpu_efficiency: total_wall_ms > 0 ? (total_cpu_ms / total_wall_ms * 100).round(1) : 100,
|
226
|
+
rows_total: stats[:rows],
|
227
|
+
note: stats[:note]
|
228
|
+
}
|
229
|
+
end.sort_by { |s| -s[:total_wall_ms] }.first(top)
|
230
|
+
|
231
|
+
{
|
232
|
+
meta: @meta || {},
|
233
|
+
total_runs: (@aggregated_stats.values.map { |s| s[:runs].size }.max || 0),
|
234
|
+
averaged_ops: averaged,
|
235
|
+
total_operations: @aggregated_stats.size
|
236
|
+
}
|
237
|
+
end
|
238
|
+
|
239
|
+
# Identify potential cache overhead operations
|
240
|
+
def cache_overhead_analysis
|
241
|
+
return {} unless enabled? && @aggregated_stats&.any?
|
242
|
+
|
243
|
+
# Look for operations that might be cache-related
|
244
|
+
cache_ops = @aggregated_stats.select do |op_key, stats|
|
245
|
+
op_key.include?("ref") || op_key.include?("load_input") || stats[:note]&.include?("cache")
|
246
|
+
end
|
247
|
+
|
248
|
+
cache_analysis = cache_ops.map do |op_key, stats|
|
249
|
+
num_runs = stats[:runs].size
|
250
|
+
avg_wall_ms = stats[:total_ms] / stats[:count]
|
251
|
+
|
252
|
+
{
|
253
|
+
op_key: op_key,
|
254
|
+
decl: stats[:decl],
|
255
|
+
tag: stats[:tag],
|
256
|
+
avg_time_ms: avg_wall_ms.round(4),
|
257
|
+
total_time_ms: stats[:total_ms].round(3),
|
258
|
+
call_count: stats[:count],
|
259
|
+
overhead_per_call: avg_wall_ms.round(6)
|
260
|
+
}
|
261
|
+
end.sort_by { |s| -s[:total_time_ms] }
|
262
|
+
|
263
|
+
{
|
264
|
+
cache_operations: cache_analysis,
|
265
|
+
total_cache_time: cache_analysis.sum { |op| op[:total_time_ms] }.round(3)
|
266
|
+
}
|
267
|
+
end
|
268
|
+
|
269
|
+
def emit_summary!
|
270
|
+
return unless enabled?
|
271
|
+
stream({ ts: Time.now.utc.iso8601(3), kind: "summary", data: summary })
|
272
|
+
end
|
273
|
+
|
274
|
+
def init_persistent!
|
275
|
+
return unless enabled? && persistent?
|
276
|
+
@persistent_initialized = false
|
277
|
+
reset!
|
278
|
+
end
|
279
|
+
|
280
|
+
def finalize!
|
281
|
+
return unless enabled?
|
282
|
+
|
283
|
+
# Emit final aggregated summary
|
284
|
+
if @aggregated_stats&.any?
|
285
|
+
stream({
|
286
|
+
ts: Time.now.utc.iso8601(3),
|
287
|
+
kind: "final_summary",
|
288
|
+
data: averaged_analysis
|
289
|
+
})
|
290
|
+
end
|
291
|
+
|
292
|
+
# Emit cache analysis if available
|
293
|
+
cache_analysis = cache_overhead_analysis
|
294
|
+
if cache_analysis[:cache_operations]&.any?
|
295
|
+
stream({
|
296
|
+
ts: Time.now.utc.iso8601(3),
|
297
|
+
kind: "cache_analysis",
|
298
|
+
data: cache_analysis
|
299
|
+
})
|
300
|
+
end
|
301
|
+
end
|
302
|
+
|
303
|
+
# Stable textual key for "match ops one by one"
|
304
|
+
def op_key(decl, idx, tag, op)
|
305
|
+
attrs = compact_attrs(op.attrs)
|
306
|
+
args = op.args
|
307
|
+
"#{decl}@#{idx}:#{tag}|#{attrs.keys.sort_by(&:to_s).map { |k| "#{k}=#{attrs[k].inspect}" }.join(",")}|args=#{args.inspect}"
|
308
|
+
end
|
309
|
+
|
310
|
+
def compact_attrs(h)
|
311
|
+
return {} unless h
|
312
|
+
h.transform_values do |v|
|
313
|
+
case v
|
314
|
+
when Array, Hash, Symbol, String, Numeric, TrueClass, FalseClass, NilClass then v
|
315
|
+
else v.to_s
|
316
|
+
end
|
317
|
+
end
|
318
|
+
end
|
319
|
+
|
320
|
+
def stream(obj)
|
321
|
+
return unless @file
|
322
|
+
FileUtils.mkdir_p(File.dirname(@file))
|
323
|
+
File.open(@file, "a") { |f| f.puts(obj.to_json) }
|
324
|
+
end
|
325
|
+
end
|
326
|
+
end
|
327
|
+
end
|
328
|
+
end
|
329
|
+
end
|
330
|
+
end
|
        # (Reconstructed from diff hunks @@ -43,15 +43,15 @@ and
        # @@ -64,15 +64,6 @@ of data/lib/kumi/core/ir/execution_engine.rb.
        # The enclosing `module Kumi::Core::IR::ExecutionEngine` and the
        # `add_persistent_memoization` helper start outside this view; the
        # second hunk only deletes the old `add_temporary_memoization`.)
        #
        # Entry point for executing an IR module.
        #
        # Builds memoized accessors first, then runs the interpreter; each
        # step is wrapped in Dev::Profiler.phase for coarse-grained timing.
        def self.run(ir_module, ctx, accessors:, registry:)
          # Use persistent accessor cache if available, otherwise create temporary one
          memoized_accessors = Dev::Profiler.phase("engine.memoization") do
            # Include input data in cache key to avoid cross-context pollution.
            # NOTE(review): ctx is probed with both symbol and string keys, and
            # a missing input falls back to key 0 — presumably callers may pass
            # either key style; confirm against Runtime::Executable.
            input_key = ctx[:input]&.hash || ctx["input"]&.hash || 0
            add_persistent_memoization(accessors, ctx[:accessor_cache], input_key)
          end

          Dev::Profiler.phase("engine.interpreter") do
            Interpreter.run(ir_module, ctx, accessors: memoized_accessors, registry: registry)
          end
        end
# frozen_string_literal: true

require "json"

module Kumi
  module Dev
    # Renders compiled IR modules as deterministic text or JSON, primarily
    # for golden-file comparison and debugging.
    #
    # NOTE: the original file placed a bare `private` before
    # `def self.format_value` — that is a no-op (bare `private` does not
    # affect singleton-method definitions). All methods now use the
    # `module_function` style uniformly; `format_value` remains callable
    # on the module, matching its previous effective visibility.
    module IR
      module_function

      # Renders +ir_module+ as a stable, human-readable listing.
      # Raises RuntimeError when +ir_module+ is nil.
      def to_text(ir_module)
        raise "nil IR" unless ir_module

        lines = []
        lines << "IR Module"
        lines << "decls: #{ir_module.decls.size}"

        ir_module.decls.each_with_index do |decl, i|
          lines << "decl[#{i}] #{decl.kind}:#{decl.name} shape=#{decl.shape} ops=#{decl.ops.size}"

          decl.ops.each_with_index do |op, j|
            # Sort attribute keys so output is deterministic across runs.
            sorted_attrs = op.attrs.keys.sort.map { |k| "#{k}=#{format_value(op.attrs[k])}" }.join(" ")
            args_str = op.args.inspect
            lines << "  #{j}: #{op.tag} #{sorted_attrs} #{args_str}".rstrip
          end
        end

        lines.join("\n") + "\n"
      end

      # Renders +ir_module+ as JSON (pretty-printed by default).
      # Raises RuntimeError when +ir_module+ is nil.
      def to_json(ir_module, pretty: true)
        raise "nil IR" unless ir_module

        data = {
          inputs: ir_module.inputs,
          decls: ir_module.decls.map do |decl|
            {
              name: decl.name,
              kind: decl.kind,
              shape: decl.shape,
              ops: decl.ops.map do |op|
                {
                  tag: op.tag,
                  attrs: op.attrs,
                  args: op.args
                }
              end
            }
          end
        }

        pretty ? JSON.pretty_generate(data) : JSON.generate(data)
      end

      # Formats a single attribute value for the text renderer:
      # booleans/others via to_s, symbols with a leading colon, arrays
      # via inspect.
      def format_value(val)
        case val
        when true, false then val.to_s
        when Symbol then ":#{val}"
        when Array then val.inspect
        else val.to_s
        end
      end
    end
  end
end
# frozen_string_literal: true

require "fileutils"
require "shellwords"

module Kumi
  module Dev
    # CLI helper: parses a schema file, lowers it through the analyzer to IR,
    # and renders/compares the IR against a golden file stored next to the
    # schema (expected/ir.txt or expected/ir.json).
    module Parse
      module_function

      # Parses and analyzes +schema_path+, then renders its IR.
      #
      # Supported opts:
      #   :json    - render JSON instead of text
      #   :write   - unconditionally write the golden file
      #   :update  - write the golden file only when it changed
      #   :no_diff - print the rendering instead of diffing
      #   plus analyzer passthrough keys (:trace, :snap, :snap_dir,
      #   :resume_from, :resume_at, :stop_after)
      #
      # Returns true on success; false on parse/analysis failure or when the
      # rendering differs from the existing golden file.
      def run(schema_path, opts = {})
        schema = load_schema(schema_path)
        return false unless schema

        runner_opts = opts.slice(:trace, :snap, :snap_dir, :resume_from, :resume_at, :stop_after)
        res = Dev::Runner.run(schema, runner_opts)

        unless res.ok?
          puts "Analysis errors:"
          res.errors.each { |err| puts "  #{err}" }
          return false
        end

        unless res.ir
          puts "Error: No IR generated"
          return false
        end

        puts "Trace written to: #{res.trace_file}" if opts[:trace] && res.respond_to?(:trace_file)

        extension = opts[:json] ? "json" : "txt"
        golden_path = File.join(File.dirname(schema_path), "expected", "ir.#{extension}")
        rendered = opts[:json] ? Dev::IR.to_json(res.ir, pretty: true) : Dev::IR.to_text(res.ir)

        # Write mode: always overwrite the golden file.
        if opts[:write]
          write_golden(golden_path, rendered)
          puts "Wrote: #{golden_path}"
          return true
        end

        # Update mode: write only when content actually changed.
        if opts[:update]
          if File.exist?(golden_path) && File.read(golden_path) == rendered
            puts "No changes (#{golden_path})"
          else
            write_golden(golden_path, rendered)
            puts "Updated: #{golden_path}"
          end
          return true
        end

        # No-diff mode: just print the rendering.
        if opts[:no_diff]
          puts rendered
          return true
        end

        # Default: diff against the golden file (or print when none exists).
        diff_against_golden(golden_path, rendered)
      end

      # Loads the schema via the text frontend. Returns the schema, or nil
      # (after printing a diagnostic) when the parser gem is missing or
      # parsing fails.
      def load_schema(schema_path)
        schema, _inputs = Kumi::Frontends::Text.load(path: schema_path)
        schema
      rescue LoadError
        puts "Error: kumi-parser gem not available. Install: gem install kumi-parser"
        nil
      rescue StandardError => e
        puts "Parse error: #{e.message}"
        nil
      end
      private_class_method :load_schema

      # Creates the golden file's directory and writes the rendering.
      def write_golden(golden_path, rendered)
        FileUtils.mkdir_p(File.dirname(golden_path))
        File.write(golden_path, rendered)
      end
      private_class_method :write_golden

      # Prints a unified diff of golden vs rendered (or the rendering itself
      # when no golden file exists). Returns true when there are no changes.
      def diff_against_golden(golden_path, rendered)
        unless File.exist?(golden_path)
          # No golden file yet: show the output so the user can inspect it.
          puts rendered
          return true
        end

        require "tempfile"
        Tempfile.create(["actual", File.extname(golden_path)]) do |actual_file|
          actual_file.write(rendered)
          actual_file.flush

          # Shell-escape both paths so spaces or shell metacharacters in the
          # schema path cannot break the command or inject shell code.
          result = `diff -u --label=expected --label=actual #{Shellwords.escape(golden_path)} #{Shellwords.escape(actual_file.path)}`
          if result.empty?
            puts "No changes (#{golden_path})"
            true
          else
            puts result.chomp
            false
          end
        end
      end
      private_class_method :diff_against_golden
    end
  end
end