broadlistening 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/CHANGELOG.md +40 -0
  5. data/CLAUDE.md +112 -0
  6. data/LICENSE +24 -0
  7. data/LICENSE-AGPLv3.txt +661 -0
  8. data/README.md +195 -0
  9. data/Rakefile +77 -0
  10. data/exe/broadlistening +6 -0
  11. data/lib/broadlistening/argument.rb +136 -0
  12. data/lib/broadlistening/cli.rb +196 -0
  13. data/lib/broadlistening/comment.rb +128 -0
  14. data/lib/broadlistening/compatibility.rb +375 -0
  15. data/lib/broadlistening/config.rb +190 -0
  16. data/lib/broadlistening/context.rb +180 -0
  17. data/lib/broadlistening/csv_loader.rb +109 -0
  18. data/lib/broadlistening/hierarchical_clustering.rb +142 -0
  19. data/lib/broadlistening/kmeans.rb +185 -0
  20. data/lib/broadlistening/llm_client.rb +84 -0
  21. data/lib/broadlistening/pipeline.rb +129 -0
  22. data/lib/broadlistening/planner.rb +114 -0
  23. data/lib/broadlistening/provider.rb +97 -0
  24. data/lib/broadlistening/spec_loader.rb +86 -0
  25. data/lib/broadlistening/status.rb +132 -0
  26. data/lib/broadlistening/steps/aggregation.rb +228 -0
  27. data/lib/broadlistening/steps/base_step.rb +42 -0
  28. data/lib/broadlistening/steps/clustering.rb +103 -0
  29. data/lib/broadlistening/steps/embedding.rb +40 -0
  30. data/lib/broadlistening/steps/extraction.rb +73 -0
  31. data/lib/broadlistening/steps/initial_labelling.rb +85 -0
  32. data/lib/broadlistening/steps/merge_labelling.rb +93 -0
  33. data/lib/broadlistening/steps/overview.rb +36 -0
  34. data/lib/broadlistening/version.rb +5 -0
  35. data/lib/broadlistening.rb +44 -0
  36. data/schema/hierarchical_result.json +152 -0
  37. data/sig/broadlistening.rbs +4 -0
  38. metadata +194 -0
@@ -0,0 +1,128 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Broadlistening
4
# Represents a normalized comment in the pipeline.
#
# A Comment is a plain value holder with an id, a body, an optional proposal
# id and source URL, plus two optional hashes: +attributes+ (collected from
# "attribute_*" / "attribute-*" keys) and +properties+ (collected for the
# property names supplied by the caller).
#
# @example Creating from a hash
#   comment = Comment.from_hash({ id: "1", body: "I think...", attribute_age: "30s" })
#   comment.id         # => "1"
#   comment.body       # => "I think..."
#   comment.attributes # => { "age" => "30s" }
class Comment
  # Matches the "attribute_" / "attribute-" prefix at the very start of a key.
  ATTRIBUTE_KEY_PREFIX = /\Aattribute[-_]/
  private_constant :ATTRIBUTE_KEY_PREFIX

  attr_accessor :id, :body, :proposal_id, :source_url, :attributes, :properties

  # @param id [Object] Comment identifier
  # @param body [Object, nil] Comment text
  # @param proposal_id [Object, nil]
  # @param source_url [Object, nil]
  # @param attributes [Hash, nil]
  # @param properties [Hash, nil]
  def initialize(id:, body:, proposal_id: nil, source_url: nil, attributes: nil, properties: nil)
    @id = id
    @body = body
    @proposal_id = proposal_id
    @source_url = source_url
    @attributes = attributes
    @properties = properties
  end

  # Create a Comment from a hash, normalizing various input formats.
  # Both symbol and string keys are accepted; the symbol key wins when both
  # carry a truthy value.
  #
  # @param hash [Hash] Input hash with comment data
  # @param property_names [Array<String, Symbol>] Property names to extract
  # @return [Comment]
  def self.from_hash(hash, property_names: [])
    new(
      id: hash[:id] || hash["id"],
      body: hash[:body] || hash["body"],
      proposal_id: hash[:proposal_id] || hash["proposal_id"],
      source_url: extract_source_url(hash),
      attributes: extract_attributes(hash),
      properties: extract_properties(hash, property_names)
    )
  end

  # Create a Comment from an object (anything responding to +id+ and +body+).
  # +proposal_id+, +source_url+ and +attributes+ are read only when the object
  # responds to them.
  #
  # @param obj [Object] Object responding to id, body, etc.
  # @param property_names [Array<String, Symbol>] Property names to extract
  # @return [Comment]
  def self.from_object(obj, property_names: [])
    new(
      id: obj.id,
      body: obj.body,
      proposal_id: obj.respond_to?(:proposal_id) ? obj.proposal_id : nil,
      source_url: obj.respond_to?(:source_url) ? obj.source_url : nil,
      attributes: extract_attributes_from_object(obj),
      properties: extract_properties_from_object(obj, property_names)
    )
  end

  # Convert to a symbol-keyed hash for serialization.
  #
  # @return [Hash]
  def to_h
    {
      id: @id,
      body: @body,
      proposal_id: @proposal_id,
      source_url: @source_url,
      attributes: @attributes,
      properties: @properties
    }
  end

  # Check whether the comment body is nil or contains only whitespace.
  # The body is converted with +to_s+ first so that a non-String body
  # (e.g. a number) does not raise.
  #
  # @return [Boolean]
  def empty?
    @body.to_s.strip.empty?
  end

  class << self
    private

    # Look up the source URL under its underscore or hyphen spelling,
    # as a symbol or a string key.
    def extract_source_url(hash)
      hash[:source_url] || hash["source_url"] ||
        hash[:"source-url"] || hash["source-url"]
    end

    # Collect every "attribute_x" / "attribute-x" entry into { "x" => value }.
    # Returns nil when the hash has no such keys.
    def extract_attributes(hash)
      attributes = hash.each_with_object({}) do |(key, value), found|
        key_str = key.to_s
        next unless key_str.match?(ATTRIBUTE_KEY_PREFIX)

        found[key_str.sub(ATTRIBUTE_KEY_PREFIX, "")] = value
      end
      attributes.empty? ? nil : attributes
    end

    # Use the object's own +attributes+ hash when it exposes a non-empty one.
    def extract_attributes_from_object(obj)
      return nil unless obj.respond_to?(:attributes) && obj.attributes.is_a?(Hash)

      obj.attributes.empty? ? nil : obj.attributes
    end

    # Build { "name" => value } for each requested property name.
    # Returns nil when no names are requested or every value is nil.
    def extract_properties(hash, property_names)
      return nil if property_names.empty?

      properties = property_names.each_with_object({}) do |prop_name, found|
        # Fall back to the string key only when the symbol key yields nil,
        # so that a legitimate +false+ value is preserved rather than dropped.
        value = hash[prop_name.to_sym]
        value = hash[prop_name.to_s] if value.nil?
        found[prop_name.to_s] = value
      end
      properties.values.all?(&:nil?) ? nil : properties
    end

    # Same as extract_properties, but reads each property by calling the
    # public method of that name when the object responds to it.
    def extract_properties_from_object(obj, property_names)
      return nil if property_names.empty?

      properties = property_names.each_with_object({}) do |prop_name, found|
        found[prop_name.to_s] = obj.respond_to?(prop_name) ? obj.public_send(prop_name) : nil
      end
      properties.values.all?(&:nil?) ? nil : properties
    end
  end
end
128
+ end
@@ -0,0 +1,375 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "json_schemer"
5
+
6
+ module Broadlistening
7
# Compatibility utilities for comparing outputs between
# Kouchou-AI (Python) and Broadlistening gem (Ruby) implementations.
#
# The comparison is structural: it looks at top-level keys, the keys and
# cluster_ids format of the first argument, the keys of the first cluster,
# the set of cluster levels, the root cluster id, and whether an overview
# is present. It does not compare the textual content of the two outputs.
#
# @example Compare two output files
#   report = Compatibility.compare_outputs(
#     python_output: "path/to/python/result.json",
#     ruby_output: "path/to/ruby/result.json"
#   )
#   puts report.summary
#
# @example Validate output structure
#   errors = Compatibility.validate_output(result_hash)
module Compatibility
  # Expected structure for hierarchical_result.json
  REQUIRED_TOP_LEVEL_KEYS = %w[arguments clusters comments propertyMap translations overview config].freeze
  REQUIRED_ARGUMENT_KEYS = %w[arg_id argument comment_id x y cluster_ids].freeze
  REQUIRED_CLUSTER_KEYS = %w[level id label takeaway value parent].freeze

  # Path to JSON Schema file
  SCHEMA_PATH = File.expand_path("../../schema/hierarchical_result.json", __dir__)

  # Accumulates the differences found by Compatibility.compare_outputs
  # together with summary statistics for each side.
  class ComparisonReport
    attr_accessor :differences, :python_stats, :ruby_stats

    def initialize
      @differences = []
      @python_stats = {}
      @ruby_stats = {}
    end

    # Record one difference.
    #
    # @param category [Symbol] Area the difference belongs to (e.g. :clusters)
    # @param message [String] Human-readable description
    # @param details [Hash] Extra key/value pairs shown under the message
    def add_difference(category, message, details = {})
      @differences << {
        category: category,
        message: message,
        details: details
      }
    end

    # @return [Boolean] true when no differences were recorded
    def compatible?
      @differences.empty?
    end

    # Render the report as multi-line text.
    #
    # @return [String]
    def summary
      rule = "=" * 60
      lines = [ rule, "Compatibility Report", rule, "" ]

      lines << "Python Output Stats:"
      @python_stats.each { |k, v| lines << " #{k}: #{v}" }
      lines << ""

      lines << "Ruby Output Stats:"
      @ruby_stats.each { |k, v| lines << " #{k}: #{v}" }
      lines << ""

      if compatible?
        lines << "Result: COMPATIBLE"
      else
        lines << "Result: INCOMPATIBLE (#{@differences.size} differences found)"
        lines << ""
        lines << "Differences:"
        @differences.each_with_index do |diff, i|
          lines << " #{i + 1}. [#{diff[:category]}] #{diff[:message]}"
          diff[:details].each { |k, v| lines << " #{k}: #{v}" } if diff[:details].any?
        end
      end

      lines << ""
      lines << rule
      lines.join("\n")
    end

    # @return [Hash] Report contents as a symbol-keyed hash
    def to_h
      {
        compatible: compatible?,
        python_stats: @python_stats,
        ruby_stats: @ruby_stats,
        differences: @differences
      }
    end
  end

  class << self
    # Compare outputs from Python and Ruby implementations
    #
    # @param python_output [String, Hash] Path to JSON file or parsed hash
    # @param ruby_output [String, Hash] Path to JSON file or parsed hash
    # @return [ComparisonReport]
    # @raise [ArgumentError] if an output is neither a String nor a Hash, or
    #   if a JSON file does not contain an object at the top level
    def compare_outputs(python_output:, ruby_output:)
      python_data = load_output(python_output)
      ruby_data = load_output(ruby_output)

      report = ComparisonReport.new
      report.python_stats = collect_stats(python_data)
      report.ruby_stats = collect_stats(ruby_data)

      compare_structure(python_data, ruby_data, report)
      compare_arguments(python_data, ruby_data, report)
      compare_clusters(python_data, ruby_data, report)
      compare_overview(python_data, ruby_data, report)

      report
    end

    # Validate output structure.
    #
    # Checks the required top-level keys, then the keys of the FIRST argument
    # and the FIRST cluster only (later elements are not inspected). Symbol
    # and string keys are both accepted.
    #
    # @param output [Hash] Parsed output hash
    # @return [Array<String>] List of validation errors (empty when valid)
    def validate_output(output)
      # A non-Hash cannot be inspected at all; report it instead of raising.
      return [ "Output must be a Hash" ] unless output.is_a?(Hash)

      errors = []

      missing_keys = REQUIRED_TOP_LEVEL_KEYS - output.keys.map(&:to_s)
      errors << "Missing top-level keys: #{missing_keys.join(', ')}" if missing_keys.any?

      missing_arg_keys = missing_sample_keys(output["arguments"] || output[:arguments], REQUIRED_ARGUMENT_KEYS)
      errors << "Missing argument keys: #{missing_arg_keys.join(', ')}" if missing_arg_keys.any?

      missing_cluster_keys = missing_sample_keys(output["clusters"] || output[:clusters], REQUIRED_CLUSTER_KEYS)
      errors << "Missing cluster keys: #{missing_cluster_keys.join(', ')}" if missing_cluster_keys.any?

      errors
    end

    # Check if output is structurally compatible with Kouchou-AI format
    #
    # @param output [Hash] Parsed output hash
    # @return [Boolean]
    def valid_output?(output)
      validate_output(output).empty?
    end

    # Validate output against JSON Schema
    #
    # @param output [Hash, String] Parsed output hash or path to JSON file
    # @return [Array<Hash>] List of validation errors, each with :path,
    #   :message and :details
    def validate_with_schema(output)
      data = output.is_a?(String) ? JSON.parse(File.read(output)) : output
      data = deep_stringify_keys(data)

      schema = JSONSchemer.schema(Pathname.new(SCHEMA_PATH))

      schema.validate(data).map do |error|
        {
          path: error["data_pointer"],
          message: error["error"],
          details: error["details"] || {}
        }
      end
    end

    # Check if output is valid according to JSON Schema
    #
    # @param output [Hash, String] Parsed output hash or path to JSON file
    # @return [Boolean]
    def valid_schema?(output)
      validate_with_schema(output).empty?
    end

    # Get the JSON Schema as a Hash (parsed once and cached).
    #
    # @return [Hash] The JSON Schema
    def schema
      @schema ||= JSON.parse(File.read(SCHEMA_PATH))
    end

    # Get the path to the JSON Schema file
    #
    # @return [String] Path to schema file
    def schema_path
      SCHEMA_PATH
    end

    private

    # Required keys missing from the first element of +collection+.
    # Returns [] when the collection is absent, not an Array, or has no
    # truthy element; a first element that is not a Hash is treated as
    # having no keys, so every required key is reported.
    def missing_sample_keys(collection, required_keys)
      return [] unless collection.is_a?(Array) && collection.any?

      sample = collection.first
      sample_keys = sample.is_a?(Hash) ? sample.keys.map(&:to_s) : []
      required_keys - sample_keys
    end

    # Turn a file path or Hash into a string-keyed Hash.
    def load_output(output)
      data =
        case output
        when String then JSON.parse(File.read(output))
        when Hash then deep_stringify_keys(output)
        else raise ArgumentError, "Output must be a file path (String) or Hash"
        end

      # Everything downstream indexes by string key, so an Array or scalar
      # document is rejected here with a clear message.
      raise ArgumentError, "Output JSON must contain an object at the top level" unless data.is_a?(Hash)

      data
    end

    # Recursively convert hash keys to strings, descending through hashes
    # and arrays at any nesting depth. Other values are returned as-is.
    def deep_stringify_keys(value)
      case value
      when Hash
        value.each_with_object({}) { |(k, v), out| out[k.to_s] = deep_stringify_keys(v) }
      when Array
        value.map { |element| deep_stringify_keys(element) }
      else
        value
      end
    end

    # Distinct "level" values of the clusters in a stable order. Numeric
    # levels sort numerically and come first; anything else (nil, strings)
    # follows, ordered by its string form, so malformed data cannot make
    # the sort raise.
    def cluster_levels(clusters)
      clusters.map { |c| c["level"] }.uniq.sort_by do |level|
        level.is_a?(Numeric) ? [ 0, level, "" ] : [ 1, 0, level.to_s ]
      end
    end

    def collect_stats(data)
      {
        argument_count: data["arguments"]&.size || 0,
        cluster_count: data["clusters"]&.size || 0,
        cluster_levels: cluster_levels(data["clusters"] || []),
        has_overview: !data["overview"].to_s.strip.empty?,
        has_property_map: data["propertyMap"]&.any? || false,
        top_level_keys: data.keys.sort
      }
    end

    # Report top-level keys present on one side only.
    def compare_structure(python, ruby, report)
      python_keys = python.keys.sort
      ruby_keys = ruby.keys.sort

      missing_in_ruby = python_keys - ruby_keys
      extra_in_ruby = ruby_keys - python_keys

      if missing_in_ruby.any?
        report.add_difference(:structure, "Missing top-level keys in Ruby output", missing: missing_in_ruby)
      end

      return unless extra_in_ruby.any?

      report.add_difference(:structure, "Extra top-level keys in Ruby output", extra: extra_in_ruby)
    end

    # Compare the first argument of each side: keys the Ruby side lacks,
    # and the format of its cluster_ids.
    def compare_arguments(python, ruby, report)
      python_args = python["arguments"] || []
      ruby_args = ruby["arguments"] || []
      return unless python_args.any? && ruby_args.any?

      python_sample = python_args.first
      ruby_sample = ruby_args.first

      missing_keys = python_sample.keys.sort - ruby_sample.keys.sort
      if missing_keys.any?
        report.add_difference(:arguments, "Missing argument keys in Ruby output", missing: missing_keys)
      end

      python_cluster_ids = python_sample["cluster_ids"]
      ruby_cluster_ids = ruby_sample["cluster_ids"]
      return unless python_cluster_ids.is_a?(Array) && ruby_cluster_ids.is_a?(Array)

      # Check format consistency (e.g., "0", "1_5", "2_10")
      python_format = detect_cluster_id_format(python_cluster_ids)
      ruby_format = detect_cluster_id_format(ruby_cluster_ids)
      return if python_format == ruby_format

      report.add_difference(
        :arguments,
        "cluster_ids format mismatch",
        python_format: python_format,
        ruby_format: ruby_format
      )
    end

    # Compare first-cluster keys, the set of levels, and the id of the
    # level-0 cluster.
    def compare_clusters(python, ruby, report)
      python_clusters = python["clusters"] || []
      ruby_clusters = ruby["clusters"] || []

      if python_clusters.any? && ruby_clusters.any?
        missing_keys = python_clusters.first.keys.sort - ruby_clusters.first.keys.sort
        if missing_keys.any?
          report.add_difference(:clusters, "Missing cluster keys in Ruby output", missing: missing_keys)
        end
      end

      python_levels = cluster_levels(python_clusters)
      ruby_levels = cluster_levels(ruby_clusters)

      if python_levels != ruby_levels
        report.add_difference(
          :clusters,
          "Cluster hierarchy levels differ",
          python_levels: python_levels,
          ruby_levels: ruby_levels
        )
      end

      python_root = python_clusters.find { |c| c["level"] == 0 }
      ruby_root = ruby_clusters.find { |c| c["level"] == 0 }
      return unless python_root && ruby_root && python_root["id"] != ruby_root["id"]

      report.add_difference(
        :clusters,
        "Root cluster ID differs",
        python_id: python_root["id"],
        ruby_id: ruby_root["id"]
      )
    end

    # Report when exactly one side has a non-blank overview.
    def compare_overview(python, ruby, report)
      python_overview = python["overview"].to_s.strip
      ruby_overview = ruby["overview"].to_s.strip
      return if python_overview.empty? == ruby_overview.empty?

      report.add_difference(
        :overview,
        "Overview presence differs",
        python_has_overview: !python_overview.empty?,
        ruby_has_overview: !ruby_overview.empty?
      )
    end

    # Classify a list of cluster ids as :empty, :numeric ("12"),
    # :level_index ("1_5"), :other, or :mixed when the ids disagree.
    # \A / \z anchor the whole string; ^ / $ would accept ids that merely
    # contain a matching line.
    def detect_cluster_id_format(cluster_ids)
      return :empty if cluster_ids.empty?

      formats = cluster_ids.map do |id|
        case id.to_s
        when /\A\d+\z/ then :numeric
        when /\A\d+_\d+\z/ then :level_index
        else :other
        end
      end

      formats.uniq.size == 1 ? formats.first : :mixed
    end
  end
end
375
+ end
@@ -0,0 +1,190 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module Broadlistening
6
# Pipeline configuration: provider/model selection, clustering sizes,
# worker count, prompts and report metadata.
#
# Values not supplied fall back to the provider's defaults, to the
# LOCAL_LLM_ADDRESS / AZURE_API_VERSION environment variables, or to the
# DEFAULT_* constants below. The configuration is validated on construction.
class Config
  attr_reader :model, :embedding_model, :provider, :cluster_nums, :workers, :prompts, :api_key,
              :enable_source_link, :hidden_properties, :is_pubcom,
              :api_base_url, :local_llm_address, :azure_api_version,
              :input, :question, :name, :intro

  DEFAULT_CLUSTER_NUMS = [ 5, 15 ].freeze
  DEFAULT_WORKERS = 10
  DEFAULT_AZURE_API_VERSION = "2024-02-15-preview"

  # Build a Config from a JSON document.
  #
  # @param json_string [String]
  # @return [Config]
  def self.from_json(json_string)
    data = JSON.parse(json_string, symbolize_names: true)
    from_hash(data)
  end

  # Build a Config from a hash with symbol or string keys.
  #
  # cluster_nums, workers and hidden_properties may be given at the top
  # level or nested under :hierarchical_clustering, :extraction and
  # :aggregation respectively; the top-level value wins.
  #
  # @param hash [Hash]
  # @return [Config]
  def self.from_hash(hash)
    opts = symbolize_keys(hash)
    # Only the section hashes themselves are re-keyed; values such as
    # hidden_properties are passed through untouched.
    section = ->(name) { symbolize_keys(opts[name]) }

    new(
      api_key: opts[:api_key],
      model: opts[:model],
      embedding_model: opts[:embedding_model],
      provider: opts[:provider],
      cluster_nums: opts[:cluster_nums] || section.call(:hierarchical_clustering)[:cluster_nums],
      workers: opts[:workers] || section.call(:extraction)[:workers],
      prompts: opts[:prompts],
      enable_source_link: opts[:enable_source_link],
      hidden_properties: opts[:hidden_properties] || section.call(:aggregation)[:hidden_properties],
      is_pubcom: opts[:is_pubcom],
      api_base_url: opts[:api_base_url],
      local_llm_address: opts[:local_llm_address],
      azure_api_version: opts[:azure_api_version],
      input: opts[:input],
      question: opts[:question],
      name: opts[:name],
      intro: opts[:intro]
    )
  end

  # Build a Config from a JSON file.
  #
  # @param path [String]
  # @return [Config]
  def self.from_file(path)
    from_json(File.read(path))
  end

  # Shallow copy of +hash+ with keys converted to symbols where possible.
  # Anything that is not a Hash yields an empty hash.
  def self.symbolize_keys(hash)
    return {} unless hash.is_a?(Hash)

    hash.each_with_object({}) do |(key, value), out|
      out[key.respond_to?(:to_sym) ? key.to_sym : key] = value
    end
  end
  private_class_method :symbolize_keys

  # @param options [Hash] Symbol-keyed options; see the attr_reader list for
  #   the recognized keys
  # @raise [ConfigurationError] if the resulting configuration is invalid
  def initialize(options = {})
    @local_llm_address = options[:local_llm_address] || ENV.fetch("LOCAL_LLM_ADDRESS", "localhost:11434")
    @provider_obj = Provider.new(
      options[:provider]&.to_sym || :openai,
      local_llm_address: @local_llm_address
    )
    @provider = @provider_obj.name
    @model = options[:model] || @provider_obj.default_model
    @embedding_model = options[:embedding_model] || @provider_obj.default_embedding_model
    @cluster_nums = options[:cluster_nums] || DEFAULT_CLUSTER_NUMS.dup
    @workers = options[:workers] || DEFAULT_WORKERS
    @prompts = default_prompts.merge((options[:prompts] || {}).transform_keys(&:to_sym))
    @api_key = options[:api_key] || @provider_obj.api_key
    # `|| default` rather than `fetch`: from_hash always passes these keys
    # (as nil when absent), which would bypass a fetch default.
    @enable_source_link = options[:enable_source_link] || false
    @hidden_properties = options[:hidden_properties] || {}
    @is_pubcom = options[:is_pubcom] || false
    @api_base_url = options[:api_base_url] || @provider_obj.base_url
    @azure_api_version = options[:azure_api_version] || ENV.fetch("AZURE_API_VERSION", DEFAULT_AZURE_API_VERSION)
    @input = options[:input]
    @question = options[:question]
    @name = options[:name]
    @intro = options[:intro]

    validate!
  end

  # Settings as a symbol-keyed hash with nil entries removed.
  # The API key and the prompts are not included.
  #
  # @return [Hash]
  def to_h
    {
      model: model,
      embedding_model: embedding_model,
      provider: provider,
      cluster_nums: cluster_nums,
      workers: workers,
      enable_source_link: enable_source_link,
      hidden_properties: hidden_properties,
      is_pubcom: is_pubcom,
      api_base_url: api_base_url,
      local_llm_address: local_llm_address,
      azure_api_version: azure_api_version,
      input: input,
      question: question,
      name: name,
      intro: intro
    }.compact
  end

  # Serialize the settings plus prompts as JSON (the API key is not included).
  #
  # @return [String]
  def to_json(*args)
    serializable_hash.to_json(*args)
  end

  # Write the settings plus prompts to +path+ as pretty-printed JSON.
  #
  # @param path [String]
  def save_to_file(path)
    File.write(path, JSON.pretty_generate(serializable_hash))
  end

  # @return [Array] Keys of hidden_properties
  def property_names
    hidden_properties.keys
  end

  private

  # to_h extended with the prompts; shared by to_json and save_to_file.
  def serializable_hash
    to_h.merge(prompts: prompts)
  end

  def validate!
    if @provider_obj.requires_api_key? && (api_key.nil? || api_key.empty?)
      raise ConfigurationError, "API key is required"
    end
    if @provider_obj.requires_base_url? && (api_base_url.nil? || api_base_url.empty?)
      raise ConfigurationError, "Azure requires api_base_url"
    end

    # Check the type first: Integer#size and String#size exist, so a scalar
    # could otherwise slip past the length check and fail on #sort.
    unless cluster_nums.is_a?(Array) && cluster_nums.all?(Integer)
      raise ConfigurationError, "cluster_nums must be an array of integers"
    end
    raise ConfigurationError, "cluster_nums must have at least 2 levels" if cluster_nums.size < 2
    raise ConfigurationError, "cluster_nums must be sorted ascending" unless cluster_nums == cluster_nums.sort
  end

  # Built-in prompts, keyed by pipeline step; merged under any user prompts.
  def default_prompts
    {
      extraction: extraction_prompt,
      initial_labelling: initial_labelling_prompt,
      merge_labelling: merge_labelling_prompt,
      overview: overview_prompt
    }
  end

  # Default prompt for the extraction step (Japanese); asks for a JSON
  # object with an "extractedOpinionList" array.
  def extraction_prompt
    <<~PROMPT
      あなたは意見抽出の専門家です。
      以下のコメントから、主要な意見や主張を抽出してください。
      1つのコメントに複数の意見が含まれる場合は、それぞれを別々に抽出してください。
      抽出した意見はJSON形式で返してください。

      出力フォーマット:
      {"extractedOpinionList": ["意見1", "意見2", ...]}

      注意:
      - 事実の記述ではなく、意見や主張を抽出してください
      - 曖昧な表現は具体的に言い換えてください
      - 重複する意見は1つにまとめてください
    PROMPT
  end

  # Default prompt for labelling a group of opinions (Japanese); asks for a
  # JSON object with "label" and "description".
  def initial_labelling_prompt
    <<~PROMPT
      あなたはクラスタ分析の専門家です。
      以下の意見グループに対して、適切なラベルと説明を付けてください。

      出力フォーマット:
      {"label": "ラベル名", "description": "このグループの説明"}

      注意:
      - ラベルは簡潔で分かりやすいものにしてください(10文字以内推奨)
      - 説明はグループの特徴を端的に表してください(50文字以内推奨)
    PROMPT
  end

  # Default prompt for merging child cluster labels into a parent label
  # (Japanese); same JSON output shape as the initial labelling prompt.
  def merge_labelling_prompt
    <<~PROMPT
      あなたはクラスタ分析の専門家です。
      以下の子クラスタのラベルと説明を統合し、親クラスタのラベルと説明を作成してください。

      出力フォーマット:
      {"label": "ラベル名", "description": "このグループの説明"}

      注意:
      - 親ラベルは子ラベルの共通テーマを表すものにしてください
      - 抽象度を上げすぎず、具体性を保ってください
    PROMPT
  end

  # Default prompt for the overall summary (Japanese).
  def overview_prompt
    <<~PROMPT
      あなたは分析レポートの専門家です。
      以下のクラスタ分析結果に基づいて、全体の概要を作成してください。

      注意:
      - 主要なテーマや傾向を簡潔にまとめてください
      - 200-300文字程度で記述してください
      - 客観的な記述を心がけてください
    PROMPT
  end
end
190
+ end