broadlistening 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/CHANGELOG.md +40 -0
  5. data/CLAUDE.md +112 -0
  6. data/LICENSE +24 -0
  7. data/LICENSE-AGPLv3.txt +661 -0
  8. data/README.md +195 -0
  9. data/Rakefile +77 -0
  10. data/exe/broadlistening +6 -0
  11. data/lib/broadlistening/argument.rb +136 -0
  12. data/lib/broadlistening/cli.rb +196 -0
  13. data/lib/broadlistening/comment.rb +128 -0
  14. data/lib/broadlistening/compatibility.rb +375 -0
  15. data/lib/broadlistening/config.rb +190 -0
  16. data/lib/broadlistening/context.rb +180 -0
  17. data/lib/broadlistening/csv_loader.rb +109 -0
  18. data/lib/broadlistening/hierarchical_clustering.rb +142 -0
  19. data/lib/broadlistening/kmeans.rb +185 -0
  20. data/lib/broadlistening/llm_client.rb +84 -0
  21. data/lib/broadlistening/pipeline.rb +129 -0
  22. data/lib/broadlistening/planner.rb +114 -0
  23. data/lib/broadlistening/provider.rb +97 -0
  24. data/lib/broadlistening/spec_loader.rb +86 -0
  25. data/lib/broadlistening/status.rb +132 -0
  26. data/lib/broadlistening/steps/aggregation.rb +228 -0
  27. data/lib/broadlistening/steps/base_step.rb +42 -0
  28. data/lib/broadlistening/steps/clustering.rb +103 -0
  29. data/lib/broadlistening/steps/embedding.rb +40 -0
  30. data/lib/broadlistening/steps/extraction.rb +73 -0
  31. data/lib/broadlistening/steps/initial_labelling.rb +85 -0
  32. data/lib/broadlistening/steps/merge_labelling.rb +93 -0
  33. data/lib/broadlistening/steps/overview.rb +36 -0
  34. data/lib/broadlistening/version.rb +5 -0
  35. data/lib/broadlistening.rb +44 -0
  36. data/schema/hierarchical_result.json +152 -0
  37. data/sig/broadlistening.rbs +4 -0
  38. metadata +194 -0
@@ -0,0 +1,129 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+
5
module Broadlistening
  # Orchestrates the execution of the broadlistening pipeline.
  #
  # The Pipeline is responsible for:
  # - Coordinating step execution order
  # - Managing execution status and locking
  # - Handling incremental execution (skip unchanged steps)
  # - Emitting instrumentation events
  #
  # @example Basic usage
  #   # Pass the options as an explicit Hash: #initialize also takes a
  #   # keyword argument, so brace-less options would be parsed as keywords.
  #   pipeline = Pipeline.new({ api_key: "...", cluster_nums: [5, 15] })
  #   result = pipeline.run(comments, output_dir: "/path/to/output")
  #
  # @example Force re-run all steps
  #   pipeline.run(comments, output_dir: "/path/to/output", force: true)
  #
  # @example Run only a specific step
  #   pipeline.run(comments, output_dir: "/path/to/output", only: :clustering)
  class Pipeline
    attr_reader :config, :spec_loader

    # @param config [Config, Object] a Config instance, or any value accepted by Config.new
    # @param spec_loader [SpecLoader, nil] step specifications; defaults to SpecLoader.default
    def initialize(config, spec_loader: nil)
      @config = config.is_a?(Config) ? config : Config.new(config)
      @spec_loader = spec_loader || SpecLoader.default
    end

    # Run the pipeline with incremental execution support
    #
    # @param comments [Array] Array of comments to process
    # @param output_dir [String] Directory for output files and status tracking
    # @param force [Boolean] Force re-run all steps
    # @param only [Symbol, nil] Run only the specified step
    # @return [Hash] The result of the pipeline
    # @raise [Error] when the status in output_dir reports an active lock
    def run(comments, output_dir:, force: false, only: nil)
      output_path = Pathname.new(output_dir)
      status = Status.new(output_path)

      # Deliberately outside the rescued region below: a run rejected because
      # of the lock must not record an error in the status, since that status
      # belongs to the process that currently holds the lock.
      raise Error, "Pipeline is locked. Another process may be running." if status.locked?

      begin
        context = Context.load_from_dir(output_path)
        context.output_dir = output_path

        # Normalize comments if not already loaded
        context.comments = normalize_comments(comments) if context.comments.empty?

        planner = Planner.new(
          config: @config,
          status: status,
          output_dir: output_path,
          spec_loader: @spec_loader
        )
        plan = planner.create_plan(force: force, only: only)

        status.start_pipeline(plan)
        execute_pipeline(plan, status, planner, context, output_path)
        status.complete_pipeline

        context.result
      rescue StandardError => e
        # Record the failure, then let the caller see the original exception.
        status.error_pipeline(e)
        raise
      end
    end

    private

    # Walks the plan in order, executing the steps marked to run and emitting
    # a skip event for the others, all inside one pipeline-level event.
    def execute_pipeline(plan, status, planner, context, output_path)
      instrument("pipeline.broadlistening", comment_count: context.comments.size) do
        plan.each_with_index do |step_plan, index|
          if step_plan[:run]
            execute_step(step_plan[:step], index, status, planner, context, output_path)
          else
            notify_skip(step_plan[:step], step_plan[:reason])
          end
        end
      end
    end

    # Runs a single step, records its completion in the status and then asks
    # the context to save the step's output.
    def execute_step(step_name, index, status, planner, context, output_path)
      status.start_step(step_name)
      # Monotonic clock: the measured duration is unaffected by wall-clock adjustments.
      started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

      payload = { step: step_name, step_index: index, step_total: @spec_loader.steps.size }
      instrument("step.broadlistening", payload) do
        step_class(step_name).new(@config, context).execute
      end

      duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
      params = planner.extract_current_params(step_name)
      status.complete_step(step_name, params: params, duration: duration)

      context.save_step(step_name, output_path)
    end

    # Converts every element to a Comment: Comment instances pass through,
    # Hashes go through Comment.from_hash, anything else through Comment.from_object.
    def normalize_comments(comments)
      comments.map do |comment|
        if comment.is_a?(Comment)
          comment
        elsif comment.is_a?(Hash)
          Comment.from_hash(comment, property_names: @config.property_names)
        else
          Comment.from_object(comment, property_names: @config.property_names)
        end
      end
    end

    # Emits the event announcing that a step was skipped, with the planner's reason.
    def notify_skip(step_name, reason)
      instrument("step.skip.broadlistening", { step: step_name, reason: reason })
    end

    # Thin wrapper around ActiveSupport::Notifications.instrument.
    def instrument(event_name, payload = {}, &block)
      ActiveSupport::Notifications.instrument(event_name, payload, &block)
    end

    # Resolves a step name such as :initial_labelling to the constant of the
    # camelized name inside Broadlistening::Steps.
    def step_class(name)
      Broadlistening::Steps.const_get(name.to_s.camelize)
    end
  end
end
@@ -0,0 +1,114 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pathname"
4
+ require "digest"
5
+
6
module Broadlistening
  # Builds the execution plan of a pipeline run: for every step of the spec it
  # decides whether the step has to run or can be skipped, and records why.
  class Planner
    attr_reader :spec_loader, :config, :status, :output_dir

    # @param config [Object] configuration read through model, embedding_model, cluster_nums and prompts
    # @param status [Object] must respond to previous_completed_jobs
    # @param output_dir [String, Pathname] directory in which step output files are looked up
    # @param spec_loader [SpecLoader, nil] defaults to SpecLoader.default
    def initialize(config:, status:, output_dir:, spec_loader: nil)
      @config = config
      @status = status
      @output_dir = Pathname.new(output_dir)
      @spec_loader = spec_loader || SpecLoader.default
      @previous_jobs = status.previous_completed_jobs
    end

    # @param force [Boolean] mark every step to run
    # @param only [Symbol, String, nil] mark only this step to run
    # @return [Array<Hash>] one { step:, run:, reason: } entry per spec, in spec order
    def create_plan(force: false, only: nil)
      spec_loader.specs.each_with_object([]) do |spec, plan|
        # Decisions for later steps look at the entries already in the plan.
        run, reason = decide_step(spec, plan, force: force, only: only)
        plan << { step: spec[:step], run: run, reason: reason }
      end
    end

    # @param step_name [Symbol, String]
    # @return [Hash] the current configuration values relevant to the step;
    #   empty for :aggregation and for unknown steps
    def extract_current_params(step_name)
      case (step = step_name.to_sym)
      when :extraction, :initial_labelling, :merge_labelling, :overview
        { model: config.model, prompt: config.prompts[step] }
      when :embedding
        { model: config.embedding_model }
      when :clustering
        { cluster_nums: config.cluster_nums }
      else
        {}
      end
    end

    private

    # Returns a [run, reason] pair for one spec. The checks are ordered:
    # force, only, previous record, output file, dependencies, parameters.
    def decide_step(spec, plan, force:, only:)
      # Forced execution
      return [ true, "forced with -f" ] if force

      name = spec[:step]

      # Run a single, explicitly requested step
      if only
        requested = only.to_sym == name
        return requested ? [ true, "forced this step with -o" ] : [ false, "forced another step with -o" ]
      end

      # Record of the previous run
      previous = find_previous_job(name)
      return [ true, "no trace of previous run" ] if previous.nil?

      # Presence of the output file
      return [ true, "previous output not found" ] unless (output_dir + spec[:output_file]).exist?

      # Dependencies that are going to re-run
      dependencies = spec[:dependencies][:steps]
      rerunning = plan.each_with_object([]) do |entry, names|
        names << entry[:step] if dependencies.include?(entry[:step]) && entry[:run]
      end
      return [ true, "dependent steps will re-run: #{rerunning.join(", ")}" ] unless rerunning.empty?

      # Parameter changes
      changed = detect_param_changes(spec, previous)
      return [ true, "parameters changed: #{changed.join(', ')}" ] unless changed.empty?

      # Nothing changed - skip
      [ false, "nothing changed" ]
    end

    # First recorded job whose :step equals the step name as a String.
    def find_previous_job(step_name)
      @previous_jobs.find { |job| job[:step] == step_name.to_s }
    end

    # Names of the spec's tracked parameters whose current value differs from
    # the value recorded in prev_job.
    def detect_param_changes(spec, prev_job)
      tracked = spec[:dependencies][:params]
      recorded = prev_job[:params] || {}
      current = extract_current_params(spec[:step])

      tracked.select do |param|
        value = current[param]
        # Keys of the recorded params may be Strings as well as Symbols
        before = recorded[param.to_s] || recorded[param]

        # Long strings such as prompts are compared by their SHA-256 digest
        long_text = value.is_a?(String) && value.length > 100
        (long_text ? Digest::SHA256.hexdigest(value) : value) != before
      end
    end
  end
end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
module Broadlistening
  # Describes one of the supported LLM providers: where its credentials and
  # endpoint come from, which models it defaults to, and how to build a client.
  class Provider
    # Static settings per provider. An :api_key_env / :base_url_env entry names
    # the environment variable the value is read from; :api_key / :base_url are
    # literal values.
    PROVIDERS = {
      openai: {
        api_key_env: "OPENAI_API_KEY",
        model: "gpt-4o-mini",
        embedding_model: "text-embedding-3-small"
      },
      azure: {
        api_key_env: "AZURE_OPENAI_API_KEY",
        base_url_env: "AZURE_OPENAI_URI",
        model: "gpt-4o-mini",
        embedding_model: "text-embedding-3-small"
      },
      gemini: {
        api_key_env: "GEMINI_API_KEY",
        base_url: "https://generativelanguage.googleapis.com/v1beta/openai/",
        model: "gemini-2.0-flash",
        embedding_model: "text-embedding-004"
      },
      openrouter: {
        api_key_env: "OPENROUTER_API_KEY",
        base_url: "https://openrouter.ai/api/v1",
        model: "gpt-4o-mini",
        embedding_model: "text-embedding-3-small"
      },
      local: {
        api_key: "not-needed",
        model: "gpt-4o-mini",
        embedding_model: "text-embedding-3-small"
      }
    }.freeze

    attr_reader :name

    class << self
      # @param name [Symbol] provider key
      # @return [Boolean] whether PROVIDERS has an entry for exactly this key
      def supported?(name)
        PROVIDERS.key?(name)
      end

      # @return [Array<Symbol>] all provider keys
      def supported_names
        PROVIDERS.keys
      end
    end

    # @param name [Symbol] one of the PROVIDERS keys
    # @param local_llm_address [String, nil] host:port of a local server;
    #   "localhost:11434" when not given
    # @raise [ConfigurationError] when name is not a PROVIDERS key
    def initialize(name, local_llm_address: nil)
      @name = name
      @local_llm_address = local_llm_address || "localhost:11434"
      @config = PROVIDERS[name] || raise(ConfigurationError, "Unknown provider: #{name}")
    end

    # @return [String, nil] the literal key if the provider defines one,
    #   otherwise the value of its API-key environment variable
    def api_key
      literal = @config[:api_key]
      return literal if literal

      ENV[@config[:api_key_env]]
    end

    # @return [String, nil] endpoint URL: derived from the local address for
    #   :local, else the literal URL, else the configured environment variable
    def base_url
      if local?
        "http://#{@local_llm_address}/v1"
      elsif @config[:base_url]
        @config[:base_url]
      elsif @config[:base_url_env]
        ENV[@config[:base_url_env]]
      end
    end

    # @return [String] default chat model of this provider
    def default_model
      @config[:model]
    end

    # @return [String] default embedding model of this provider
    def default_embedding_model
      @config[:embedding_model]
    end

    # @return [Boolean] true for every provider except :local
    def requires_api_key?
      !local?
    end

    # @return [Boolean] true only for :azure
    def requires_base_url?
      azure?
    end

    # @return [Boolean]
    def azure?
      @name == :azure
    end

    # Builds an OpenAI::Client. Azure always passes uri_base, api_type and
    # api_version; other providers pass uri_base only when base_url is given.
    #
    # @param api_key [String, nil]
    # @param base_url [String, nil]
    # @param azure_api_version [String, nil] only used for :azure
    # @return [OpenAI::Client]
    def build_openai_client(api_key:, base_url:, azure_api_version: nil)
      options = { access_token: api_key }
      if azure?
        options[:uri_base] = base_url
        options[:api_type] = :azure
        options[:api_version] = azure_api_version
      elsif base_url
        options[:uri_base] = base_url
      end

      OpenAI::Client.new(**options)
    end

    private

    def local?
      @name == :local
    end
  end
end
@@ -0,0 +1,86 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
module Broadlistening
  # Reads a step-specification JSON file and converts it into the step names,
  # output files and dependencies used by this gem.
  class SpecLoader
    # Maps the step names of the Python version to the names used by the Ruby gem
    STEP_MAPPING = {
      "extraction" => :extraction,
      "embedding" => :embedding,
      "hierarchical_clustering" => :clustering,
      "hierarchical_initial_labelling" => :initial_labelling,
      "hierarchical_merge_labelling" => :merge_labelling,
      "hierarchical_overview" => :overview,
      "hierarchical_aggregation" => :aggregation,
      "hierarchical_visualization" => nil # skipped (outside the gem's responsibility)
    }.freeze

    # Intermediate file names specific to the Ruby gem
    OUTPUT_FILES = {
      extraction: "extraction.json",
      embedding: "embeddings.json",
      clustering: "clustering.json",
      initial_labelling: "initial_labels.json",
      merge_labelling: "merge_labels.json",
      overview: "overview.json",
      aggregation: "result.json"
    }.freeze

    attr_reader :specs

    class << self
      # @return [SpecLoader] a loader for the file at default_specs_path
      def default
        new(default_specs_path)
      end

      # @return [String] the BROADLISTENING_SPECS_PATH environment variable when
      #   set, otherwise a path resolved relative to this file's directory
      def default_specs_path
        ENV.fetch("BROADLISTENING_SPECS_PATH") do
          File.expand_path("../../../../../server/broadlistening/pipeline/hierarchical_specs.json", __dir__)
        end
      end
    end

    # @param specs_path [String] path of the JSON specification file
    def initialize(specs_path)
      parsed = JSON.parse(File.read(specs_path), symbolize_names: true)
      @specs = convert_specs(parsed)
    end

    # @param step_name [Symbol, String]
    # @return [Hash, nil] the converted spec of the step, or nil when absent
    def find(step_name)
      @specs.find { |spec| spec[:step] == step_name.to_sym }
    end

    # @return [Array<Symbol>] converted step names in file order
    def steps
      @specs.map { |spec| spec.fetch(:step) }
    end

    private

    # Converts the raw entries, dropping every step that has no Ruby
    # counterpart in STEP_MAPPING (hierarchical_visualization and the like).
    def convert_specs(raw_specs)
      raw_specs.each_with_object([]) do |raw, converted|
        step = STEP_MAPPING[raw[:step]]
        next unless step

        converted << {
          step: step,
          output_file: OUTPUT_FILES[step],
          dependencies: convert_dependencies(raw),
          use_llm: raw[:use_llm] || false
        }
      end
    end

    # Builds { params:, steps: } for one raw entry: parameter names become
    # Symbols, dependency step names are mapped through STEP_MAPPING and
    # names without a Ruby counterpart are dropped.
    def convert_dependencies(spec)
      declared = spec[:dependencies] || {}
      declared_params = (declared[:params] || []).map(&:to_sym)

      # When use_llm is true, prompt and model are tracked automatically
      params = spec[:use_llm] ? declared_params | %i[prompt model] : declared_params.uniq

      dependency_steps = (declared[:steps] || []).map { |name| STEP_MAPPING[name] }.compact

      { params: params, steps: dependency_steps }
    end
  end
end
@@ -0,0 +1,132 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "fileutils"
5
+ require "pathname"
6
+ require "time"
7
+ require "digest"
8
+
9
module Broadlistening
  # Tracks the state of a pipeline run in <output_dir>/status.json: overall
  # status, the plan, a time-limited lock, and the record of completed steps
  # that incremental planning relies on.
  class Status
    LOCK_DURATION = 300 # seconds (5 minutes)

    attr_reader :output_dir, :status_file, :data

    # @param output_dir [String, Pathname] directory that holds status.json
    def initialize(output_dir)
      @output_dir = Pathname.new(output_dir)
      @status_file = @output_dir / "status.json"
      @data = load_or_initialize
    end

    # @return [Hash] the parsed status file (symbol keys), or a fresh
    #   "initialized" status when the file does not exist
    def load_or_initialize
      if status_file.exist?
        JSON.parse(status_file.read, symbolize_names: true)
      else
        {
          status: "initialized",
          completed_jobs: [],
          previously_completed_jobs: []
        }
      end
    end

    # Writes the current data to status.json, creating output_dir if needed.
    def save
      FileUtils.mkdir_p(output_dir)
      status_file.write(JSON.pretty_generate(@data))
    end

    # Marks the pipeline as running and takes the lock.
    #
    # Records of steps the plan does NOT re-run are carried over into
    # :previously_completed_jobs so that they survive this run. Steps that are
    # planned to run keep no record until they complete again; if the run is
    # interrupted they therefore look like "never run" to the next plan.
    #
    # @param plan [Array<Hash>] entries with :step, :run and :reason
    def start_pipeline(plan)
      rerun_steps = plan.select { |entry| entry[:run] }.map { |entry| entry[:step].to_s }
      carried_over = previous_completed_jobs
                     .uniq { |job| job[:step] } # newest record of each step comes first
                     .reject { |job| rerun_steps.include?(job[:step]) }

      # Leftovers of an earlier finished or failed run must not describe this one.
      %i[error error_stack_trace end_time].each { |key| @data.delete(key) }

      @data.merge!(
        status: "running",
        plan: plan.map { |entry| serialize_plan_entry(entry) },
        start_time: Time.now.iso8601,
        completed_jobs: [],
        previously_completed_jobs: carried_over,
        lock_until: lock_time.iso8601
      )
      save
    end

    # Records the step that is now executing and renews the lock.
    def start_step(step_name)
      @data[:current_job] = step_name.to_s
      @data[:current_job_started] = Time.now.iso8601
      @data[:lock_until] = lock_time.iso8601
      save
    end

    # Appends the completion record of a step and clears the current job.
    #
    # @param step_name [Symbol, String]
    # @param params [Hash] parameters the step ran with (long strings are stored as digests)
    # @param duration [Numeric] execution time of the step
    # @param token_usage [Integer]
    def complete_step(step_name, params:, duration:, token_usage: 0)
      @data[:completed_jobs] ||= []
      @data[:completed_jobs] << {
        step: step_name.to_s,
        completed: Time.now.iso8601,
        duration: duration,
        params: serialize_params(params),
        token_usage: token_usage
      }
      @data.delete(:current_job)
      @data.delete(:current_job_started)
      save
    end

    # Marks the run as completed after folding older records into
    # :previously_completed_jobs.
    def complete_pipeline
      merge_previous_jobs
      @data[:status] = "completed"
      @data[:end_time] = Time.now.iso8601
      @data.delete(:previous)
      save
    end

    # Marks the run as failed and stores the error class, message and backtrace.
    #
    # @param error [Exception]
    def error_pipeline(error)
      @data[:status] = "error"
      @data[:end_time] = Time.now.iso8601
      @data[:error] = "#{error.class}: #{error.message}"
      @data[:error_stack_trace] = error.backtrace&.join("\n")
      save
    end

    # @return [Boolean] true while the status is "running" and :lock_until is in the future
    def locked?
      return false unless @data[:status] == "running"
      return false unless @data[:lock_until]

      Time.parse(@data[:lock_until]) > Time.now
    end

    # @return [Array<Hash>] records of the current run followed by the carried-over older records
    def previous_completed_jobs
      (@data[:completed_jobs] || []) + (@data[:previously_completed_jobs] || [])
    end

    private

    def lock_time
      Time.now + LOCK_DURATION
    end

    def serialize_plan_entry(entry)
      {
        step: entry[:step].to_s,
        run: entry[:run],
        reason: entry[:reason]
      }
    end

    def serialize_params(params)
      params.transform_values do |value|
        # Long strings such as prompts are stored as a SHA-256 digest
        # (smaller file, and all that is needed for comparison)
        if value.is_a?(String) && value.length > 100
          Digest::SHA256.hexdigest(value)
        else
          value
        end
      end
    end

    # Rebuilds :previously_completed_jobs as every older record whose step was
    # not completed again in this run. A :previous snapshot, when present in
    # the data, is honoured as an additional source of older records.
    def merge_previous_jobs
      older = []
      if (snapshot = @data[:previous])
        older += snapshot[:completed_jobs] || []
        older += snapshot[:previously_completed_jobs] || []
      end
      older += @data[:previously_completed_jobs] || []

      newly_completed = (@data[:completed_jobs] || []).map { |job| job[:step] }
      @data[:previously_completed_jobs] =
        older.uniq { |job| job[:step] }.reject { |job| newly_completed.include?(job[:step]) }
    end
  end
end