broadlistening 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/CHANGELOG.md +40 -0
  5. data/CLAUDE.md +112 -0
  6. data/LICENSE +24 -0
  7. data/LICENSE-AGPLv3.txt +661 -0
  8. data/README.md +195 -0
  9. data/Rakefile +77 -0
  10. data/exe/broadlistening +6 -0
  11. data/lib/broadlistening/argument.rb +136 -0
  12. data/lib/broadlistening/cli.rb +196 -0
  13. data/lib/broadlistening/comment.rb +128 -0
  14. data/lib/broadlistening/compatibility.rb +375 -0
  15. data/lib/broadlistening/config.rb +190 -0
  16. data/lib/broadlistening/context.rb +180 -0
  17. data/lib/broadlistening/csv_loader.rb +109 -0
  18. data/lib/broadlistening/hierarchical_clustering.rb +142 -0
  19. data/lib/broadlistening/kmeans.rb +185 -0
  20. data/lib/broadlistening/llm_client.rb +84 -0
  21. data/lib/broadlistening/pipeline.rb +129 -0
  22. data/lib/broadlistening/planner.rb +114 -0
  23. data/lib/broadlistening/provider.rb +97 -0
  24. data/lib/broadlistening/spec_loader.rb +86 -0
  25. data/lib/broadlistening/status.rb +132 -0
  26. data/lib/broadlistening/steps/aggregation.rb +228 -0
  27. data/lib/broadlistening/steps/base_step.rb +42 -0
  28. data/lib/broadlistening/steps/clustering.rb +103 -0
  29. data/lib/broadlistening/steps/embedding.rb +40 -0
  30. data/lib/broadlistening/steps/extraction.rb +73 -0
  31. data/lib/broadlistening/steps/initial_labelling.rb +85 -0
  32. data/lib/broadlistening/steps/merge_labelling.rb +93 -0
  33. data/lib/broadlistening/steps/overview.rb +36 -0
  34. data/lib/broadlistening/version.rb +5 -0
  35. data/lib/broadlistening.rb +44 -0
  36. data/schema/hierarchical_result.json +152 -0
  37. data/sig/broadlistening.rbs +4 -0
  38. metadata +194 -0
data/README.md ADDED
@@ -0,0 +1,195 @@
1
+ # Broadlistening
2
+
3
+ 広聴AI の Broadlistening パイプラインの Ruby 実装です。LLM を使用して公開コメントをクラスタリング・分析します。
4
+
5
+ ## 概要
6
+
7
+ Broadlistening は、大量のコメントや意見を AI を活用して分析するためのパイプラインです。以下のステップで処理を行います:
8
+
9
+ 1. **Extraction (意見抽出)** - コメントから主要な意見を LLM で抽出
10
+ 2. **Embedding (ベクトル化)** - 抽出した意見をベクトル化
11
+ 3. **Clustering (クラスタリング)** - UMAP + KMeans + 階層的クラスタリング
12
+ 4. **Initial Labelling (初期ラベリング)** - 各クラスタに LLM でラベル付け
13
+ 5. **Merge Labelling (ラベル統合)** - 階層的にラベルを統合
14
+ 6. **Overview (概要生成)** - 全体の概要を LLM で生成
15
+ 7. **Aggregation (JSON 組み立て)** - 結果を JSON 形式で出力
16
+
17
+ ## インストール
18
+
19
+ ### Gemfile に追加
20
+
21
+ ```ruby
22
+ gem 'broadlistening'
23
+ ```
24
+
25
+ または GitHub から直接インストール:
26
+
27
+ ```ruby
28
+ gem 'broadlistening', github: 'takahashim/broadlistening-ruby'
29
+ ```
30
+
31
+ ### 依存関係のインストール
32
+
33
+ ```bash
34
+ bundle install
35
+ ```
36
+
37
+ ## 使い方
38
+
39
+ ### 基本的な使用方法
40
+
41
+ ```ruby
42
+ require 'broadlistening'
43
+
44
+ # コメントデータを準備
45
+ comments = [
46
+ { id: "1", body: "環境問題への対策が必要です", proposal_id: "123" },
47
+ { id: "2", body: "公共交通機関の充実を希望します", proposal_id: "123" },
48
+ # ...
49
+ ]
50
+
51
+ # パイプラインを実行
52
+ pipeline = Broadlistening::Pipeline.new(
53
+ api_key: ENV['OPENAI_API_KEY'],
54
+ model: "gpt-4o-mini",
55
+ cluster_nums: [5, 15]
56
+ )
57
+ result = pipeline.run(comments)
58
+
59
+ # 結果を取得
60
+ puts result[:overview]
61
+ puts result[:clusters]
62
+ ```
63
+
64
+ ### Rails での使用例
65
+
66
+ ```ruby
67
+ # app/jobs/analysis_job.rb
68
+ class AnalysisJob < ApplicationJob
69
+ queue_as :analysis
70
+
71
+ def perform(proposal_id)
72
+ proposal = Proposal.find(proposal_id)
73
+ comments = proposal.comments.map do |c|
74
+ { id: c.id, body: c.body, proposal_id: c.proposal_id }
75
+ end
76
+
77
+ pipeline = Broadlistening::Pipeline.new(
78
+ api_key: ENV['OPENAI_API_KEY'],
79
+ model: "gpt-4o-mini",
80
+ cluster_nums: [5, 15]
81
+ )
82
+ result = pipeline.run(comments)
83
+
84
+ proposal.create_analysis_result!(
85
+ result_data: result,
86
+ comment_count: comments.size
87
+ )
88
+ end
89
+ end
90
+ ```
91
+
92
+ ### 設定オプション
93
+
94
+ ```ruby
95
+ Broadlistening::Pipeline.new(
96
+ api_key: "your-api-key", # OpenAI API キー(必須)
97
+ model: "gpt-4o-mini", # LLM モデル(デフォルト: gpt-4o-mini)
98
+ embedding_model: "text-embedding-3-small", # 埋め込みモデル
99
+ cluster_nums: [5, 15], # 各階層のクラスタ数(デフォルト: [5, 15])
100
+ workers: 10, # 並列処理のワーカー数
101
+ prompts: { # カスタムプロンプト(オプション)
102
+ extraction: "...",
103
+ initial_labelling: "...",
104
+ merge_labelling: "...",
105
+ overview: "..."
106
+ }
107
+ )
108
+ ```
109
+
110
+ ## 出力形式
111
+
112
+ パイプラインの結果は以下の構造を持つ Hash です:
113
+
114
+ ```ruby
115
+ {
116
+ arguments: [
117
+ {
118
+ arg_id: "A1_0",
119
+ argument: "環境問題への対策が必要",
120
+ x: 0.5, # UMAP X座標
121
+ y: 0.3, # UMAP Y座標
122
+ cluster_ids: ["0", "1_0", "2_3"] # 所属クラスタID
123
+ },
124
+ # ...
125
+ ],
126
+ clusters: [
127
+ {
128
+ level: 0,
129
+ id: "0",
130
+ label: "全体",
131
+ description: "",
132
+ count: 100,
133
+ parent: nil
134
+ },
135
+ {
136
+ level: 1,
137
+ id: "1_0",
138
+ label: "環境・エネルギー",
139
+ description: "環境問題やエネルギー政策に関する意見",
140
+ count: 25,
141
+ parent: "0"
142
+ },
143
+ # ...
144
+ ],
145
+ relations: [
146
+ { arg_id: "A1_0", comment_id: "1", proposal_id: "123" },
147
+ # ...
148
+ ],
149
+ comment_count: 50,
150
+ argument_count: 100,
151
+ overview: "分析の概要テキスト...",
152
+ config: { model: "gpt-4o-mini", ... }
153
+ }
154
+ ```
155
+
156
+ ## 依存関係
157
+
158
+ - Ruby >= 3.1.0
159
+ - activesupport >= 7.0
160
+ - numo-narray ~> 0.9
161
+ - ruby-openai ~> 7.0
162
+ - parallel ~> 1.20
163
+ - rice ~> 4.6.0
164
+ - umappp ~> 0.2
165
+
166
+ ### umappp のインストール
167
+
168
+ umappp は C++ ネイティブ拡張を含むため、インストール時に C++ コンパイラが必要です:
169
+
170
+ ```bash
171
+ # macOS
172
+ CXX=clang++ gem install umappp
173
+
174
+ # Linux
175
+ gem install umappp
176
+ ```
177
+
178
+ **注意**: Rice 4.7.x との互換性問題があるため、Rice 4.6.x を使用してください。
179
+
180
+ ## 開発
181
+
182
+ ```bash
183
+ # セットアップ
184
+ bin/setup
185
+
186
+ # テスト実行
187
+ bundle exec rspec
188
+
189
+ # コンソール
190
+ bin/console
191
+ ```
192
+
193
+ ## ライセンス
194
+
195
+ AGPL 3.0
data/Rakefile ADDED
@@ -0,0 +1,77 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
# `rake spec` runs the RSpec suite with the default pattern.
RSpec::Core::RakeTask.new(:spec)

# `rake spec:compatibility` runs only the specs under spec/compatibility.
RSpec::Core::RakeTask.new("spec:compatibility") do |spec_task|
  spec_task.pattern = "spec/compatibility/**/*_spec.rb"
end

namespace :compatibility do
  # Checks a fixed Python-generated result file with
  # Broadlistening::Compatibility.validate_output and prints either basic
  # statistics (valid) or the list of errors (invalid, exit status 1).
  desc "Validate Kouchou-AI Python output structure"
  task :validate_python do
    require "bundler/setup"
    require "broadlistening"
    require "json"

    # Resolved relative to the directory containing this Rakefile.
    python_path = File.expand_path(
      "../server/broadlistening/pipeline/outputs/example-hierarchical-polis/hierarchical_result.json",
      __dir__
    )

    unless File.exist?(python_path)
      puts "Error: Python output not found at: #{python_path}"
      exit 1
    end

    output = JSON.parse(File.read(python_path))
    errors = Broadlistening::Compatibility.validate_output(output)

    # Failure path first; `exit` stops the task before the stats are printed.
    unless errors.empty?
      puts "Invalid: #{python_path}"
      errors.each { |message| puts " - #{message}" }
      exit 1
    end

    puts "Valid: #{python_path}"
    puts ""
    puts "Stats:"
    puts " Arguments: #{output['arguments'].size}"
    puts " Clusters: #{output['clusters'].size}"
    puts " Levels: #{output['clusters'].map { |cluster| cluster['level'] }.uniq.sort.join(', ')}"
    puts " Has overview: #{!output['overview'].to_s.strip.empty?}"
  end

  # Compares two result files and exits 0 when the report says they are
  # compatible, 1 otherwise. Both task arguments are required.
  desc "Compare Python and Ruby outputs"
  task :compare, %i[python_file ruby_file] do |_task, args|
    require "bundler/setup"
    require "broadlistening"

    python_file = args[:python_file]
    ruby_file = args[:ruby_file]

    if !python_file || !ruby_file
      puts "Usage: rake compatibility:compare[python_output.json,ruby_output.json]"
      exit 1
    end

    # Report only the first missing file, then stop.
    missing = [python_file, ruby_file].find { |path| !File.exist?(path) }
    if missing
      puts "Error: File not found: #{missing}"
      exit 1
    end

    report = Broadlistening::Compatibility.compare_outputs(
      python_output: python_file,
      ruby_output: ruby_file
    )

    puts report.summary
    status = report.compatible? ? 0 : 1
    exit(status)
  end
end

task default: :spec
data/exe/broadlistening ADDED
@@ -0,0 +1,6 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "broadlistening"
5
+
6
# Hand the raw command-line arguments to the CLI object and run it.
cli = Broadlistening::CLI.new(ARGV)
cli.run
data/lib/broadlistening/argument.rb ADDED
@@ -0,0 +1,136 @@
1
+ # frozen_string_literal: true
2
+
3
module Broadlistening
  # One opinion ("argument") pulled out of a comment.
  #
  # An instance always carries +arg_id+, +argument+ and +comment_id+; the
  # remaining fields default to nil and are writable so that later pipeline
  # steps can fill them in.
  #
  # @example Building an argument and enriching it afterwards
  #   arg = Argument.new(arg_id: "A1_0", argument: "We need more parks", comment_id: "1")
  #   arg.embedding = [0.1, 0.2, 0.3]  # set by the embedding step
  #   arg.x = 0.5                      # set by the clustering step
  class Argument
    attr_accessor :arg_id, :argument, :comment_id,
                  :embedding, :x, :y, :cluster_ids,
                  :attributes, :url, :properties

    # @param arg_id [String] identifier of the argument (e.g. "A1_0")
    # @param argument [String] opinion text
    # @param comment_id [Object] identifier of the originating comment
    # @param embedding [Object, nil] embedding data
    # @param x [Object, nil] x coordinate
    # @param y [Object, nil] y coordinate
    # @param cluster_ids [Object, nil] cluster IDs this argument belongs to
    # @param attributes [Object, nil] attributes copied from the comment
    # @param url [Object, nil] source URL
    # @param properties [Object, nil] properties copied from the comment
    def initialize(arg_id:, argument:, comment_id:,
                   embedding: nil, x: nil, y: nil, cluster_ids: nil,
                   attributes: nil, url: nil, properties: nil)
      @arg_id = arg_id
      @argument = argument
      @comment_id = comment_id

      @embedding = embedding
      @x = x
      @y = y
      @cluster_ids = cluster_ids

      @attributes = attributes
      @url = url
      @properties = properties
    end

    # Build an Argument from a hash keyed by symbols or strings.
    #
    # For every field the symbol key is tried first; the string key is used
    # only when the symbol lookup is nil or false.
    #
    # @param hash [Hash] input data
    # @return [Argument]
    def self.from_hash(hash)
      fields = %i[arg_id argument comment_id embedding x y cluster_ids attributes url properties]
      new(**fields.to_h { |key| [key, hash[key] || hash[key.to_s]] })
    end

    # Build an Argument for the +index+-th opinion extracted from +comment+.
    #
    # The id has the form "A<comment id>_<index>"; attributes, source URL and
    # properties are copied over from the comment.
    #
    # @param comment [Comment] source comment
    # @param opinion_text [String] extracted opinion text
    # @param index [Integer] position of the opinion within the comment
    # @return [Argument]
    def self.from_comment(comment, opinion_text, index)
      generated_id = "A#{comment.id}_#{index}"

      new(
        arg_id: generated_id,
        argument: opinion_text,
        comment_id: comment.id,
        attributes: comment.attributes,
        url: comment.source_url,
        properties: comment.properties
      )
    end

    # Serialize every field that is not nil.
    #
    # @return [Hash] symbol-keyed hash without nil values
    def to_h
      every_field = {
        arg_id: @arg_id,
        argument: @argument,
        comment_id: @comment_id,
        embedding: @embedding,
        x: @x,
        y: @y,
        cluster_ids: @cluster_ids,
        attributes: @attributes,
        url: @url,
        properties: @properties
      }
      every_field.reject { |_key, value| value.nil? }
    end

    # Serialize only the id and the embedding (for embeddings.json).
    # Unlike #to_h, nil values are kept.
    #
    # @return [Hash]
    def to_embedding_h
      { arg_id: @arg_id, embedding: @embedding }
    end

    # Serialize only the id, coordinates and cluster ids (for clustering.json).
    # Unlike #to_h, nil values are kept.
    #
    # @return [Hash]
    def to_clustering_h
      { arg_id: @arg_id, x: @x, y: @y, cluster_ids: @cluster_ids }
    end

    # Tell whether this argument is assigned to +cluster_id+.
    #
    # @param cluster_id [String] cluster ID to look for
    # @return [Boolean] false when no cluster ids have been assigned yet
    def in_cluster?(cluster_id)
      return false if @cluster_ids.nil?

      @cluster_ids.include?(cluster_id) || false
    end

    # Integer form of the comment id.
    #
    # Uses +comment_id+ when it is set; otherwise falls back to the digits
    # between the leading "A" and the first "_" of +arg_id+.
    #
    # @return [Integer] 0 when neither source yields a number
    def comment_id_int
      return @comment_id.to_i if @comment_id

      captured = @arg_id&.match(/\AA(\d+)_/)
      return 0 unless captured

      captured[1].to_i
    end
  end
end
data/lib/broadlistening/cli.rb ADDED
@@ -0,0 +1,196 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "optparse"
4
+ require "json"
5
+ require "pathname"
6
+
7
module Broadlistening
  # Command-line front end for the broadlistening pipeline.
  #
  # A run parses the options, loads and validates the config file, prepares the
  # output directory, shows the execution plan and waits for confirmation
  # (unless --skip-interaction was given), and finally executes the pipeline.
  class CLI
    # Base directory for results: "outputs" two levels above this file's directory.
    PIPELINE_DIR = Pathname.new(__dir__).parent.parent / "outputs"

    attr_reader :options

    # @param argv [Array<String>] command-line arguments; option flags are
    #   removed from this array in place when it is parsed
    def initialize(argv = ARGV)
      @argv = argv
      @options = {
        force: false,
        only: nil,
        skip_interaction: false
      }
    end

    # Run the whole command.
    #
    # Terminates the process with status 1 on Broadlistening::Error (after
    # printing the message to stderr), 130 on Interrupt, and 0 when the user
    # aborts at the confirmation prompt.
    #
    # @return [Object] whatever Pipeline#run returns
    def run
      parse_options
      validate_config_path

      config = load_config
      validate_config(config)

      output_dir = determine_output_dir
      ensure_output_dir(output_dir)

      unless @options[:skip_interaction]
        show_plan(config, output_dir)
        confirm_execution || exit(0)
      end

      execute_pipeline(config, output_dir)
    rescue Broadlistening::Error => e
      $stderr.puts "Error: #{e.message}"
      exit 1
    rescue Interrupt
      $stderr.puts "\nInterrupted"
      exit 130
    end

    private

    # Parse @argv into @options and remember the first positional argument as
    # the config path. Malformed options (unknown flag, missing argument) are
    # reported on stderr with the usage line and end the process with status 1
    # instead of surfacing as an unhandled exception.
    def parse_options
      parser = OptionParser.new do |opts|
        opts.banner = "Usage: broadlistening CONFIG [options]"
        opts.separator ""
        opts.separator "Run the broadlistening pipeline with the specified configuration."
        opts.separator ""
        opts.separator "Options:"

        opts.on("-f", "--force", "Force re-run all steps regardless of previous execution") do
          @options[:force] = true
        end

        opts.on("-o", "--only STEP", "Run only the specified step (e.g., extraction, embedding, clustering, etc.)") do |step|
          @options[:only] = step.to_sym
        end

        opts.on("--skip-interaction", "Skip the interactive confirmation prompt and run pipeline immediately") do
          @options[:skip_interaction] = true
        end

        opts.on("-h", "--help", "Show this help message") do
          puts opts
          exit 0
        end

        opts.on("-v", "--version", "Show version") do
          puts "broadlistening #{Broadlistening::VERSION}"
          exit 0
        end
      end

      parser.parse!(@argv)
      @config_path = @argv.first
    rescue OptionParser::ParseError => e
      $stderr.puts "Error: #{e.message}"
      $stderr.puts "Usage: broadlistening CONFIG [options]"
      exit 1
    end

    # Exit with status 1 unless a config path was given and the file exists.
    def validate_config_path
      unless @config_path
        $stderr.puts "Error: CONFIG is required"
        $stderr.puts "Usage: broadlistening CONFIG [options]"
        exit 1
      end

      unless File.exist?(@config_path)
        $stderr.puts "Error: Config file not found: #{@config_path}"
        exit 1
      end
    end

    # Load the config file.
    #
    # @return [Config]
    # @raise [Broadlistening::ConfigurationError] when the file is not valid JSON
    def load_config
      Config.from_file(@config_path)
    rescue JSON::ParserError => e
      raise Broadlistening::ConfigurationError, "Invalid JSON in config file: #{e.message}"
    end

    # Check the fields the CLI needs before running anything.
    #
    # @param config [Config]
    # @raise [Broadlistening::ConfigurationError] when 'input' or 'question' is
    #   missing, or the input file does not exist
    def validate_config(config)
      raise Broadlistening::ConfigurationError, "Missing required field 'input' in config" unless config.input
      raise Broadlistening::ConfigurationError, "Missing required field 'question' in config" unless config.question
      raise Broadlistening::ConfigurationError, "Input file not found: #{config.input}" unless File.exist?(config.input)
    end

    # Same as the Python version: the output directory is derived from the
    # config file name, e.g. "config/my_report.json" -> "outputs/my_report".
    #
    # @return [Pathname]
    def determine_output_dir
      config_basename = File.basename(@config_path, ".*")
      PIPELINE_DIR / config_basename
    end

    # Create the output directory (including parents) when it does not exist.
    #
    # Uses Pathname#mkpath so this file does not rely on the FileUtils
    # constant, which none of its own requires load.
    #
    # @param output_dir [Pathname, String]
    def ensure_output_dir(output_dir)
      dir = Pathname(output_dir)
      dir.mkpath unless dir.exist?
    end

    # Print, for every step of the plan, whether it will run or be skipped and why.
    def show_plan(config, output_dir)
      puts "So, here is what I am planning to run:"

      planner = create_planner(config, output_dir)
      plan = planner.create_plan(force: @options[:force], only: @options[:only])

      plan.each do |step|
        status = step[:run] ? "RUN" : "SKIP"
        puts " #{step[:step]}: #{status} (#{step[:reason]})"
      end

      puts ""
    end

    # Wait for the user to press enter.
    #
    # @return [Boolean] true once a line (or end of input) was read, false when
    #   the wait was interrupted with Ctrl+C
    def confirm_execution
      print "Looks good? Press enter to continue or Ctrl+C to abort."
      $stdin.gets
      true
    rescue Interrupt
      puts ""
      false
    end

    # @return [Planner] planner backed by the status stored in +output_dir+
    def create_planner(config, output_dir)
      status = Status.new(output_dir)
      Planner.new(config: config, status: status, output_dir: output_dir)
    end

    # Load the comments, hook up progress output and run the pipeline.
    #
    # @return [Object] the pipeline result
    def execute_pipeline(config, output_dir)
      comments = load_comments(config.input)

      pipeline = Pipeline.new(config)

      setup_progress_output

      result = pipeline.run(
        comments,
        output_dir: output_dir.to_s,
        force: @options[:force],
        only: @options[:only]
      )

      puts ""
      puts "Pipeline completed."

      result
    end

    # Read the input comments, choosing the loader by file extension
    # (case-insensitive).
    #
    # @param input_path [String] path to a .csv or .json file
    # @return [Object] CsvLoader.load's result for CSV; the parsed JSON with
    #   symbolized keys for JSON
    # @raise [Broadlistening::ConfigurationError] for any other extension, or
    #   when a .json input is not valid JSON (mirrors #load_config so the error
    #   is reported by #run instead of escaping as a backtrace)
    def load_comments(input_path)
      case File.extname(input_path).downcase
      when ".csv"
        CsvLoader.load(input_path)
      when ".json"
        begin
          JSON.parse(File.read(input_path), symbolize_names: true)
        rescue JSON::ParserError => e
          raise Broadlistening::ConfigurationError, "Invalid JSON in input file: #{e.message}"
        end
      else
        raise Broadlistening::ConfigurationError, "Unsupported input format: #{File.extname(input_path)}"
      end
    end

    # Subscribe to the pipeline's notifications and echo them to stdout:
    # step start, step skip, and an in-place "current/total" progress line.
    def setup_progress_output
      ActiveSupport::Notifications.subscribe("step.broadlistening") do |*, payload|
        puts "Running step: #{payload[:step]}"
      end

      ActiveSupport::Notifications.subscribe("step.skip.broadlistening") do |*, payload|
        puts "Skipping '#{payload[:step]}'"
      end

      ActiveSupport::Notifications.subscribe("progress.broadlistening") do |*, payload|
        step = payload[:step]
        current = payload[:current]
        total = payload[:total]
        # "\r" rewrites the same terminal line; finish it once the step is done.
        print "\r #{step}: #{current}/#{total}"
        puts "" if current == total
      end
    end
  end
end