RubyGems - broadlistening - Versions diffs - 0.7.0 - Mend

broadlistening 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38) hide show

checksums.yaml +7 -0
data/.rspec +3 -0
data/.rubocop.yml +3 -0
data/CHANGELOG.md +40 -0
data/CLAUDE.md +112 -0
data/LICENSE +24 -0
data/LICENSE-AGPLv3.txt +661 -0
data/README.md +195 -0
data/Rakefile +77 -0
data/exe/broadlistening +6 -0
data/lib/broadlistening/argument.rb +136 -0
data/lib/broadlistening/cli.rb +196 -0
data/lib/broadlistening/comment.rb +128 -0
data/lib/broadlistening/compatibility.rb +375 -0
data/lib/broadlistening/config.rb +190 -0
data/lib/broadlistening/context.rb +180 -0
data/lib/broadlistening/csv_loader.rb +109 -0
data/lib/broadlistening/hierarchical_clustering.rb +142 -0
data/lib/broadlistening/kmeans.rb +185 -0
data/lib/broadlistening/llm_client.rb +84 -0
data/lib/broadlistening/pipeline.rb +129 -0
data/lib/broadlistening/planner.rb +114 -0
data/lib/broadlistening/provider.rb +97 -0
data/lib/broadlistening/spec_loader.rb +86 -0
data/lib/broadlistening/status.rb +132 -0
data/lib/broadlistening/steps/aggregation.rb +228 -0
data/lib/broadlistening/steps/base_step.rb +42 -0
data/lib/broadlistening/steps/clustering.rb +103 -0
data/lib/broadlistening/steps/embedding.rb +40 -0
data/lib/broadlistening/steps/extraction.rb +73 -0
data/lib/broadlistening/steps/initial_labelling.rb +85 -0
data/lib/broadlistening/steps/merge_labelling.rb +93 -0
data/lib/broadlistening/steps/overview.rb +36 -0
data/lib/broadlistening/version.rb +5 -0
data/lib/broadlistening.rb +44 -0
data/schema/hierarchical_result.json +152 -0
data/sig/broadlistening.rbs +4 -0
metadata +194 -0

data/lib/broadlistening/pipeline.rb ADDED Viewed

@@ -0,0 +1,129 @@
+# frozen_string_literal: true
+require "pathname"
+module Broadlistening
+  # Orchestrates the execution of the broadlistening pipeline.
+  #
+  # The Pipeline is responsible for:
+  # - Coordinating step execution order
+  # - Managing execution status and locking
+  # - Handling incremental execution (skip unchanged steps)
+  # - Emitting instrumentation events
+  #
+  # @example Basic usage
+  #   pipeline = Pipeline.new(api_key: "...", cluster_nums: [5, 15])
+  #   result = pipeline.run(comments, output_dir: "/path/to/output")
+  #
+  # @example Force re-run all steps
+  #   pipeline.run(comments, output_dir: "/path/to/output", force: true)
+  #
+  # @example Run only a specific step
+  #   pipeline.run(comments, output_dir: "/path/to/output", only: :clustering)
+  class Pipeline
+    attr_reader :config, :spec_loader
+    def initialize(config, spec_loader: nil)
+      @config = config.is_a?(Config) ? config : Config.new(config)
+      @spec_loader = spec_loader || SpecLoader.default
+    end
+    # Run the pipeline with incremental execution support
+    #
+    # @param comments [Array] Array of comments to process
+    # @param output_dir [String] Directory for output files and status tracking
+    # @param force [Boolean] Force re-run all steps
+    # @param only [Symbol, nil] Run only the specified step
+    # @return [Hash] The result of the pipeline
+    def run(comments, output_dir:, force: false, only: nil)
+      output_path = Pathname.new(output_dir)
+      status = Status.new(output_path)
+      raise Error, "Pipeline is locked. Another process may be running." if status.locked?
+      context = Context.load_from_dir(output_path)
+      context.output_dir = output_path
+      # Normalize comments if not already loaded
+      context.comments = normalize_comments(comments) if context.comments.empty?
+      planner = Planner.new(
+        config: @config,
+        status: status,
+        output_dir: output_path,
+        spec_loader: @spec_loader
+      )
+      plan = planner.create_plan(force: force, only: only)
+      status.start_pipeline(plan)
+      execute_pipeline(plan, status, planner, context, output_path)
+      status.complete_pipeline
+      context.result
+    rescue StandardError => e
+      status&.error_pipeline(e)
+      raise
+    end
+    private
+    def execute_pipeline(plan, status, planner, context, output_path)
+      instrument("pipeline.broadlistening", comment_count: context.comments.size) do
+        plan.each_with_index do |step_plan, index|
+          if step_plan[:run]
+            execute_step(step_plan[:step], index, status, planner, context, output_path)
+          else
+            notify_skip(step_plan[:step], step_plan[:reason])
+          end
+        end
+      end
+    end
+    def execute_step(step_name, index, status, planner, context, output_path)
+      status.start_step(step_name)
+      start_time = Time.now
+      steps = @spec_loader.steps
+      payload = { step: step_name, step_index: index, step_total: steps.size }
+      instrument("step.broadlistening", payload) do
+        step = step_class(step_name).new(@config, context)
+        step.execute
+      end
+      duration = Time.now - start_time
+      params = planner.extract_current_params(step_name)
+      status.complete_step(step_name, params: params, duration: duration)
+      context.save_step(step_name, output_path)
+    end
+    def normalize_comments(comments)
+      comments.map do |comment|
+        if comment.is_a?(Comment)
+          comment
+        elsif comment.is_a?(Hash)
+          Comment.from_hash(comment, property_names: @config.property_names)
+        else
+          Comment.from_object(comment, property_names: @config.property_names)
+        end
+      end
+    end
+    def notify_skip(step_name, reason)
+      ActiveSupport::Notifications.instrument("step.skip.broadlistening", {
+                                                step: step_name,
+                                                reason: reason
+                                              })
+    end
+    def instrument(event_name, payload = {}, &block)
+      ActiveSupport::Notifications.instrument(event_name, payload, &block)
+    end
+    def step_class(name)
+      Broadlistening::Steps.const_get(name.to_s.camelize)
+    end
+  end
+end

data/lib/broadlistening/planner.rb ADDED Viewed

@@ -0,0 +1,114 @@
+# frozen_string_literal: true
+require "pathname"
+require "digest"
+module Broadlistening
+  class Planner
+    attr_reader :spec_loader, :config, :status, :output_dir
+    def initialize(config:, status:, output_dir:, spec_loader: nil)
+      @config = config
+      @status = status
+      @output_dir = Pathname.new(output_dir)
+      @spec_loader = spec_loader || SpecLoader.default
+      @previous_jobs = status.previous_completed_jobs
+    end
+    def create_plan(force: false, only: nil)
+      plan = []
+      spec_loader.specs.each do |spec|
+        step_name = spec[:step]
+        run, reason = decide_step(spec, plan, force: force, only: only)
+        plan << { step: step_name, run: run, reason: reason }
+      end
+      plan
+    end
+    def extract_current_params(step_name)
+      case step_name.to_sym
+      when :extraction
+        { model: config.model, prompt: config.prompts[:extraction] }
+      when :embedding
+        { model: config.embedding_model }
+      when :clustering
+        { cluster_nums: config.cluster_nums }
+      when :initial_labelling
+        { model: config.model, prompt: config.prompts[:initial_labelling] }
+      when :merge_labelling
+        { model: config.model, prompt: config.prompts[:merge_labelling] }
+      when :overview
+        { model: config.model, prompt: config.prompts[:overview] }
+      when :aggregation
+        {}
+      else
+        {}
+      end
+    end
+    private
+    def decide_step(spec, plan, force:, only:)
+      step_name = spec[:step]
+      # 強制実行
+      return [ true, "forced with -f" ] if force
+      # 特定ステップのみ実行
+      if only
+        return [ true, "forced this step with -o" ] if only.to_sym == step_name
+        return [ false, "forced another step with -o" ]
+      end
+      # 前回実行記録の確認
+      prev_job = find_previous_job(step_name)
+      return [ true, "no trace of previous run" ] unless prev_job
+      # 出力ファイルの存在確認
+      output_file = output_dir / spec[:output_file]
+      return [ true, "previous output not found" ] unless output_file.exist?
+      # 依存ステップの確認
+      deps = spec[:dependencies][:steps]
+      changing_deps = plan.select { |p| deps.include?(p[:step]) && p[:run] }
+      if changing_deps.any?
+        dep_names = changing_deps.map { |d| d[:step] }.join(", ")
+        return [ true, "dependent steps will re-run: #{dep_names}" ]
+      end
+      # パラメータ変更の確認
+      changed_params = detect_param_changes(spec, prev_job)
+      return [ true, "parameters changed: #{changed_params.join(', ')}" ] if changed_params.any?
+      # 変更なし - スキップ
+      [ false, "nothing changed" ]
+    end
+    def find_previous_job(step_name)
+      @previous_jobs.find { |j| j[:step] == step_name.to_s }
+    end
+    def detect_param_changes(spec, prev_job)
+      params_to_check = spec[:dependencies][:params]
+      prev_params = prev_job[:params] || {}
+      current_params = extract_current_params(spec[:step])
+      params_to_check.select do |param|
+        current_value = current_params[param]
+        # prev_paramsのキーは文字列の場合もある
+        prev_value = prev_params[param.to_s] || prev_params[param]
+        # プロンプトなど長い文字列はハッシュ化して比較
+        if current_value.is_a?(String) && current_value.length > 100
+          Digest::SHA256.hexdigest(current_value) != prev_value
+        else
+          current_value != prev_value
+        end
+      end
+    end
+  end
+end

data/lib/broadlistening/provider.rb ADDED Viewed

@@ -0,0 +1,97 @@
+# frozen_string_literal: true
+module Broadlistening
+  class Provider
+    PROVIDERS = {
+      openai: {
+        api_key_env: "OPENAI_API_KEY",
+        model: "gpt-4o-mini",
+        embedding_model: "text-embedding-3-small"
+      },
+      azure: {
+        api_key_env: "AZURE_OPENAI_API_KEY",
+        base_url_env: "AZURE_OPENAI_URI",
+        model: "gpt-4o-mini",
+        embedding_model: "text-embedding-3-small"
+      },
+      gemini: {
+        api_key_env: "GEMINI_API_KEY",
+        base_url: "https://generativelanguage.googleapis.com/v1beta/openai/",
+        model: "gemini-2.0-flash",
+        embedding_model: "text-embedding-004"
+      },
+      openrouter: {
+        api_key_env: "OPENROUTER_API_KEY",
+        base_url: "https://openrouter.ai/api/v1",
+        model: "gpt-4o-mini",
+        embedding_model: "text-embedding-3-small"
+      },
+      local: {
+        api_key: "not-needed",
+        model: "gpt-4o-mini",
+        embedding_model: "text-embedding-3-small"
+      }
+    }.freeze
+    attr_reader :name
+    def self.supported?(name)
+      PROVIDERS.key?(name)
+    end
+    def self.supported_names
+      PROVIDERS.keys
+    end
+    def initialize(name, local_llm_address: nil)
+      @name = name
+      @local_llm_address = local_llm_address || "localhost:11434"
+      @config = PROVIDERS.fetch(name) { raise ConfigurationError, "Unknown provider: #{name}" }
+    end
+    def api_key
+      @config[:api_key] || ENV.fetch(@config[:api_key_env], nil)
+    end
+    def base_url
+      return "http://#{@local_llm_address}/v1" if @name == :local
+      @config[:base_url] || (@config[:base_url_env] && ENV.fetch(@config[:base_url_env], nil))
+    end
+    def default_model
+      @config[:model]
+    end
+    def default_embedding_model
+      @config[:embedding_model]
+    end
+    def requires_api_key?
+      @name != :local
+    end
+    def requires_base_url?
+      @name == :azure
+    end
+    def azure?
+      @name == :azure
+    end
+    def build_openai_client(api_key:, base_url:, azure_api_version: nil)
+      if azure?
+        OpenAI::Client.new(
+          access_token: api_key,
+          uri_base: base_url,
+          api_type: :azure,
+          api_version: azure_api_version
+        )
+      elsif base_url
+        OpenAI::Client.new(access_token: api_key, uri_base: base_url)
+      else
+        OpenAI::Client.new(access_token: api_key)
+      end
+    end
+  end
+end

data/lib/broadlistening/spec_loader.rb ADDED Viewed

@@ -0,0 +1,86 @@
+# frozen_string_literal: true
+require "json"
+module Broadlistening
+  class SpecLoader
+    # Python版のステップ名をRuby gem用に変換するマッピング
+    STEP_MAPPING = {
+      "extraction" => :extraction,
+      "embedding" => :embedding,
+      "hierarchical_clustering" => :clustering,
+      "hierarchical_initial_labelling" => :initial_labelling,
+      "hierarchical_merge_labelling" => :merge_labelling,
+      "hierarchical_overview" => :overview,
+      "hierarchical_aggregation" => :aggregation,
+      "hierarchical_visualization" => nil # スキップ（gem責務外）
+    }.freeze
+    # Ruby gem独自の中間ファイル名
+    OUTPUT_FILES = {
+      extraction: "extraction.json",
+      embedding: "embeddings.json",
+      clustering: "clustering.json",
+      initial_labelling: "initial_labels.json",
+      merge_labelling: "merge_labels.json",
+      overview: "overview.json",
+      aggregation: "result.json"
+    }.freeze
+    attr_reader :specs
+    def initialize(specs_path)
+      raw_specs = JSON.parse(File.read(specs_path), symbolize_names: true)
+      @specs = convert_specs(raw_specs)
+    end
+    def self.default
+      new(default_specs_path)
+    end
+    def self.default_specs_path
+      ENV.fetch("BROADLISTENING_SPECS_PATH") do
+        File.expand_path("../../../../../server/broadlistening/pipeline/hierarchical_specs.json", __dir__)
+      end
+    end
+    def find(step_name)
+      @specs.find { |s| s[:step] == step_name.to_sym }
+    end
+    def steps
+      @specs.map { |s| s[:step] }
+    end
+    private
+    def convert_specs(raw_specs)
+      raw_specs.filter_map do |spec|
+        ruby_step = STEP_MAPPING[spec[:step]]
+        next if ruby_step.nil? # hierarchical_visualization等をスキップ
+        {
+          step: ruby_step,
+          output_file: OUTPUT_FILES[ruby_step],
+          dependencies: convert_dependencies(spec),
+          use_llm: spec[:use_llm] || false
+        }
+      end
+    end
+    def convert_dependencies(spec)
+      deps = spec[:dependencies] || {}
+      params = (deps[:params] || []).map(&:to_sym)
+      # use_llm が true の場合、prompt と model を自動追加
+      if spec[:use_llm]
+        params << :prompt unless params.include?(:prompt)
+        params << :model unless params.include?(:model)
+      end
+      steps = (deps[:steps] || []).filter_map { |s| STEP_MAPPING[s] }
+      { params: params.uniq, steps: steps }
+    end
+  end
+end

data/lib/broadlistening/status.rb ADDED Viewed

@@ -0,0 +1,132 @@
+# frozen_string_literal: true
+require "json"
+require "fileutils"
+require "pathname"
+require "time"
+require "digest"
+module Broadlistening
+  class Status
+    LOCK_DURATION = 300 # 5分
+    attr_reader :output_dir, :status_file, :data
+    def initialize(output_dir)
+      @output_dir = Pathname.new(output_dir)
+      @status_file = @output_dir / "status.json"
+      @data = load_or_initialize
+    end
+    def load_or_initialize
+      if status_file.exist?
+        JSON.parse(status_file.read, symbolize_names: true)
+      else
+        {
+          status: "initialized",
+          completed_jobs: [],
+          previously_completed_jobs: []
+        }
+      end
+    end
+    def save
+      FileUtils.mkdir_p(output_dir)
+      status_file.write(JSON.pretty_generate(@data))
+    end
+    def start_pipeline(plan)
+      @data.merge!(
+        status: "running",
+        plan: plan.map { |p| serialize_plan_entry(p) },
+        start_time: Time.now.iso8601,
+        completed_jobs: [],
+        lock_until: lock_time.iso8601
+      )
+      save
+    end
+    def start_step(step_name)
+      @data[:current_job] = step_name.to_s
+      @data[:current_job_started] = Time.now.iso8601
+      @data[:lock_until] = lock_time.iso8601
+      save
+    end
+    def complete_step(step_name, params:, duration:, token_usage: 0)
+      @data[:completed_jobs] ||= []
+      @data[:completed_jobs] << {
+        step: step_name.to_s,
+        completed: Time.now.iso8601,
+        duration: duration,
+        params: serialize_params(params),
+        token_usage: token_usage
+      }
+      @data.delete(:current_job)
+      @data.delete(:current_job_started)
+      save
+    end
+    def complete_pipeline
+      merge_previous_jobs
+      @data[:status] = "completed"
+      @data[:end_time] = Time.now.iso8601
+      @data.delete(:previous)
+      save
+    end
+    def error_pipeline(error)
+      @data[:status] = "error"
+      @data[:end_time] = Time.now.iso8601
+      @data[:error] = "#{error.class}: #{error.message}"
+      @data[:error_stack_trace] = error.backtrace&.join("\n")
+      save
+    end
+    def locked?
+      return false unless @data[:status] == "running"
+      return false unless @data[:lock_until]
+      Time.parse(@data[:lock_until]) > Time.now
+    end
+    def previous_completed_jobs
+      (@data[:completed_jobs] || []) + (@data[:previously_completed_jobs] || [])
+    end
+    private
+    def lock_time
+      Time.now + LOCK_DURATION
+    end
+    def serialize_plan_entry(entry)
+      {
+        step: entry[:step].to_s,
+        run: entry[:run],
+        reason: entry[:reason]
+      }
+    end
+    def serialize_params(params)
+      params.transform_values do |v|
+        # プロンプトなど長い文字列はハッシュ化して保存（サイズ削減・比較用）
+        if v.is_a?(String) && v.length > 100
+          Digest::SHA256.hexdigest(v)
+        else
+          v
+        end
+      end
+    end
+    def merge_previous_jobs
+      return unless @data[:previous]
+      old_jobs = @data[:previous][:completed_jobs] || []
+      old_jobs += @data[:previous][:previously_completed_jobs] || []
+      newly_completed = @data[:completed_jobs].map { |j| j[:step] }
+      @data[:previously_completed_jobs] = old_jobs.reject { |j| newly_completed.include?(j[:step]) }
+    end
+  end
+end