broadlistening 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +3 -0
- data/CHANGELOG.md +40 -0
- data/CLAUDE.md +112 -0
- data/LICENSE +24 -0
- data/LICENSE-AGPLv3.txt +661 -0
- data/README.md +195 -0
- data/Rakefile +77 -0
- data/exe/broadlistening +6 -0
- data/lib/broadlistening/argument.rb +136 -0
- data/lib/broadlistening/cli.rb +196 -0
- data/lib/broadlistening/comment.rb +128 -0
- data/lib/broadlistening/compatibility.rb +375 -0
- data/lib/broadlistening/config.rb +190 -0
- data/lib/broadlistening/context.rb +180 -0
- data/lib/broadlistening/csv_loader.rb +109 -0
- data/lib/broadlistening/hierarchical_clustering.rb +142 -0
- data/lib/broadlistening/kmeans.rb +185 -0
- data/lib/broadlistening/llm_client.rb +84 -0
- data/lib/broadlistening/pipeline.rb +129 -0
- data/lib/broadlistening/planner.rb +114 -0
- data/lib/broadlistening/provider.rb +97 -0
- data/lib/broadlistening/spec_loader.rb +86 -0
- data/lib/broadlistening/status.rb +132 -0
- data/lib/broadlistening/steps/aggregation.rb +228 -0
- data/lib/broadlistening/steps/base_step.rb +42 -0
- data/lib/broadlistening/steps/clustering.rb +103 -0
- data/lib/broadlistening/steps/embedding.rb +40 -0
- data/lib/broadlistening/steps/extraction.rb +73 -0
- data/lib/broadlistening/steps/initial_labelling.rb +85 -0
- data/lib/broadlistening/steps/merge_labelling.rb +93 -0
- data/lib/broadlistening/steps/overview.rb +36 -0
- data/lib/broadlistening/version.rb +5 -0
- data/lib/broadlistening.rb +44 -0
- data/schema/hierarchical_result.json +152 -0
- data/sig/broadlistening.rbs +4 -0
- metadata +194 -0
data/lib/broadlistening/comment.rb
@@ -0,0 +1,128 @@
# frozen_string_literal: true

module Broadlistening
  # Represents a normalized comment in the pipeline.
  #
  # Comments are the input data to the pipeline, containing user opinions
  # that will be processed into arguments through the extraction step.
  #
  # @example Creating from a hash
  #   comment = Comment.from_hash({ id: "1", body: "I think...", attribute_age: "30代" })
  #   comment.id         # => "1"
  #   comment.body       # => "I think..."
  #   comment.attributes # => { "age" => "30代" }
  class Comment
    attr_accessor :id, :body, :proposal_id, :source_url, :attributes, :properties

    def initialize(id:, body:, proposal_id: nil, source_url: nil, attributes: nil, properties: nil)
      @id = id
      @body = body
      @proposal_id = proposal_id
      @source_url = source_url
      @attributes = attributes
      @properties = properties
    end

    # Create a Comment from a hash, normalizing various input formats
    #
    # @param hash [Hash] Input hash with comment data
    # @param property_names [Array<String>] Property names to extract (from config)
    # @return [Comment]
    def self.from_hash(hash, property_names: [])
      new(
        id: hash[:id] || hash["id"],
        body: hash[:body] || hash["body"],
        proposal_id: hash[:proposal_id] || hash["proposal_id"],
        source_url: extract_source_url(hash),
        attributes: extract_attributes(hash),
        properties: extract_properties(hash, property_names)
      )
    end

    # Create a Comment from an object (e.g., ActiveRecord model)
    #
    # @param obj [Object] Object responding to id, body, etc.
    # @param property_names [Array<String>] Property names to extract
    # @return [Comment]
    def self.from_object(obj, property_names: [])
      new(
        id: obj.id,
        body: obj.body,
        proposal_id: obj.respond_to?(:proposal_id) ? obj.proposal_id : nil,
        source_url: obj.respond_to?(:source_url) ? obj.source_url : nil,
        attributes: extract_attributes_from_object(obj),
        properties: extract_properties_from_object(obj, property_names)
      )
    end

    # Convert to hash for serialization
    #
    # @return [Hash]
    def to_h
      {
        id: @id,
        body: @body,
        proposal_id: @proposal_id,
        source_url: @source_url,
        attributes: @attributes,
        properties: @properties
      }
    end

    # Check if comment body is empty or nil
    #
    # @return [Boolean]
    def empty?
      @body.nil? || @body.strip.empty?
    end

    class << self
      private

      def extract_source_url(hash)
        hash[:source_url] || hash["source_url"] ||
          hash[:"source-url"] || hash["source-url"]
      end

      def extract_attributes(hash)
        attributes = {}
        hash.each do |key, value|
          key_str = key.to_s
          next unless key_str.start_with?("attribute_") || key_str.start_with?("attribute-")

          attr_name = key_str.sub(/^attribute[-_]/, "")
          attributes[attr_name] = value
        end
        attributes.empty? ? nil : attributes
      end

      def extract_attributes_from_object(obj)
        return nil unless obj.respond_to?(:attributes) && obj.attributes.is_a?(Hash)

        obj.attributes.empty? ? nil : obj.attributes
      end

      def extract_properties(hash, property_names)
        return nil if property_names.empty?

        properties = {}
        property_names.each do |prop_name|
          value = hash[prop_name.to_sym] || hash[prop_name.to_s]
          properties[prop_name.to_s] = value
        end
        properties.values.all?(&:nil?) ? nil : properties
      end

      def extract_properties_from_object(obj, property_names)
        return nil if property_names.empty?

        properties = {}
        property_names.each do |prop_name|
          value = obj.respond_to?(prop_name) ? obj.public_send(prop_name) : nil
          properties[prop_name.to_s] = value
        end
        properties.values.all?(&:nil?) ? nil : properties
      end
    end
  end
end
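A minimal usage sketch based on the class above; the row keys and values are illustrative, not taken from the gem's fixtures. Keys prefixed with `attribute_` land in `attributes`, while columns named in `property_names` land in `properties`.

require "broadlistening"

# Hypothetical row as it might come out of a CSV loader; keys are illustrative.
row = {
  "id" => "42",
  "body" => "I think we need more parks.",
  "attribute_age" => "30代",
  "region" => "Tokyo"
}

comment = Broadlistening::Comment.from_hash(row, property_names: ["region"])
comment.attributes  # => { "age" => "30代" }
comment.properties  # => { "region" => "Tokyo" }
comment.empty?      # => false
comment.to_h        # => hash ready for JSON serialization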
data/lib/broadlistening/compatibility.rb
@@ -0,0 +1,375 @@
# frozen_string_literal: true

require "json"
require "json_schemer"

module Broadlistening
  # Compatibility utilities for comparing outputs between
  # Kouchou-AI (Python) and Broadlistening gem (Ruby) implementations.
  #
  # @example Compare two output files
  #   report = Compatibility.compare_outputs(
  #     python_output: "path/to/python/result.json",
  #     ruby_output: "path/to/ruby/result.json"
  #   )
  #   puts report.summary
  #
  # @example Validate output against schema
  #   errors = Compatibility.validate_output(result_hash)
  module Compatibility
    # Expected structure for hierarchical_result.json
    REQUIRED_TOP_LEVEL_KEYS = %w[arguments clusters comments propertyMap translations overview config].freeze
    REQUIRED_ARGUMENT_KEYS = %w[arg_id argument comment_id x y cluster_ids].freeze
    REQUIRED_CLUSTER_KEYS = %w[level id label takeaway value parent].freeze

    # Path to JSON Schema file
    SCHEMA_PATH = File.expand_path("../../schema/hierarchical_result.json", __dir__)

    class ComparisonReport
      attr_accessor :differences, :python_stats, :ruby_stats

      def initialize
        @differences = []
        @python_stats = {}
        @ruby_stats = {}
      end

      def add_difference(category, message, details = {})
        @differences << {
          category: category,
          message: message,
          details: details
        }
      end

      def compatible?
        @differences.empty?
      end

      def summary
        lines = []
        lines << "=" * 60
        lines << "Compatibility Report"
        lines << "=" * 60
        lines << ""

        lines << "Python Output Stats:"
        @python_stats.each { |k, v| lines << " #{k}: #{v}" }
        lines << ""

        lines << "Ruby Output Stats:"
        @ruby_stats.each { |k, v| lines << " #{k}: #{v}" }
        lines << ""

        if compatible?
          lines << "Result: COMPATIBLE"
        else
          lines << "Result: INCOMPATIBLE (#{@differences.size} differences found)"
          lines << ""
          lines << "Differences:"
          @differences.each_with_index do |diff, i|
            lines << " #{i + 1}. [#{diff[:category]}] #{diff[:message]}"
            diff[:details].each { |k, v| lines << " #{k}: #{v}" } if diff[:details].any?
          end
        end

        lines << ""
        lines << "=" * 60
        lines.join("\n")
      end

      def to_h
        {
          compatible: compatible?,
          python_stats: @python_stats,
          ruby_stats: @ruby_stats,
          differences: @differences
        }
      end
    end

    class << self
      # Compare outputs from Python and Ruby implementations
      #
      # @param python_output [String, Hash] Path to JSON file or parsed hash
      # @param ruby_output [String, Hash] Path to JSON file or parsed hash
      # @return [ComparisonReport]
      def compare_outputs(python_output:, ruby_output:)
        python_data = load_output(python_output)
        ruby_data = load_output(ruby_output)

        report = ComparisonReport.new
        report.python_stats = collect_stats(python_data)
        report.ruby_stats = collect_stats(ruby_data)

        compare_structure(python_data, ruby_data, report)
        compare_arguments(python_data, ruby_data, report)
        compare_clusters(python_data, ruby_data, report)
        compare_overview(python_data, ruby_data, report)

        report
      end

      # Validate output structure
      #
      # @param output [Hash] Parsed output hash
      # @return [Array<String>] List of validation errors
      def validate_output(output)
        errors = []

        # Check top-level keys
        missing_keys = REQUIRED_TOP_LEVEL_KEYS - output.keys.map(&:to_s)
        errors << "Missing top-level keys: #{missing_keys.join(', ')}" if missing_keys.any?

        # Check arguments structure
        if output["arguments"] || output[:arguments]
          args = output["arguments"] || output[:arguments]
          if args.is_a?(Array) && args.any?
            sample = args.first
            sample_keys = sample.keys.map(&:to_s)
            missing_arg_keys = REQUIRED_ARGUMENT_KEYS - sample_keys
            errors << "Missing argument keys: #{missing_arg_keys.join(', ')}" if missing_arg_keys.any?
          end
        end

        # Check clusters structure
        if output["clusters"] || output[:clusters]
          clusters = output["clusters"] || output[:clusters]
          if clusters.is_a?(Array) && clusters.any?
            sample = clusters.first
            sample_keys = sample.keys.map(&:to_s)
            missing_cluster_keys = REQUIRED_CLUSTER_KEYS - sample_keys
            errors << "Missing cluster keys: #{missing_cluster_keys.join(', ')}" if missing_cluster_keys.any?
          end
        end

        errors
      end

      # Check if output is structurally compatible with Kouchou-AI format
      #
      # @param output [Hash] Parsed output hash
      # @return [Boolean]
      def valid_output?(output)
        validate_output(output).empty?
      end

      # Validate output against JSON Schema
      #
      # @param output [Hash, String] Parsed output hash or path to JSON file
      # @return [Array<Hash>] List of validation errors from JSON Schema
      def validate_with_schema(output)
        data = output.is_a?(String) ? JSON.parse(File.read(output)) : output
        data = deep_stringify_keys(data) if data.is_a?(Hash)

        schema = JSONSchemer.schema(Pathname.new(SCHEMA_PATH))
        errors = schema.validate(data).to_a

        errors.map do |error|
          {
            path: error["data_pointer"],
            message: error["error"],
            details: error["details"] || {}
          }
        end
      end

      # Check if output is valid according to JSON Schema
      #
      # @param output [Hash, String] Parsed output hash or path to JSON file
      # @return [Boolean]
      def valid_schema?(output)
        validate_with_schema(output).empty?
      end

      # Get the JSON Schema as a Hash
      #
      # @return [Hash] The JSON Schema
      def schema
        @schema ||= JSON.parse(File.read(SCHEMA_PATH))
      end

      # Get the path to the JSON Schema file
      #
      # @return [String] Path to schema file
      def schema_path
        SCHEMA_PATH
      end

      private

      def load_output(output)
        case output
        when String
          JSON.parse(File.read(output))
        when Hash
          deep_stringify_keys(output)
        else
          raise ArgumentError, "Output must be a file path (String) or Hash"
        end
      end

      def deep_stringify_keys(hash)
        hash.transform_keys(&:to_s).transform_values do |v|
          case v
          when Hash then deep_stringify_keys(v)
          when Array then v.map { |e| e.is_a?(Hash) ? deep_stringify_keys(e) : e }
          else v
          end
        end
      end

      def collect_stats(data)
        {
          argument_count: data["arguments"]&.size || 0,
          cluster_count: data["clusters"]&.size || 0,
          cluster_levels: data["clusters"]&.map { |c| c["level"] }&.uniq&.sort || [],
          has_overview: !data["overview"].to_s.strip.empty?,
          has_property_map: data["propertyMap"]&.any? || false,
          top_level_keys: data.keys.sort
        }
      end

      def compare_structure(python, ruby, report)
        python_keys = python.keys.sort
        ruby_keys = ruby.keys.sort

        missing_in_ruby = python_keys - ruby_keys
        extra_in_ruby = ruby_keys - python_keys

        if missing_in_ruby.any?
          report.add_difference(
            :structure,
            "Missing top-level keys in Ruby output",
            missing: missing_in_ruby
          )
        end

        if extra_in_ruby.any?
          report.add_difference(
            :structure,
            "Extra top-level keys in Ruby output",
            extra: extra_in_ruby
          )
        end
      end

      def compare_arguments(python, ruby, report)
        python_args = python["arguments"] || []
        ruby_args = ruby["arguments"] || []

        # Compare argument structure (keys)
        if python_args.any? && ruby_args.any?
          python_keys = python_args.first.keys.sort
          ruby_keys = ruby_args.first.keys.sort

          missing_keys = python_keys - ruby_keys
          if missing_keys.any?
            report.add_difference(
              :arguments,
              "Missing argument keys in Ruby output",
              missing: missing_keys
            )
          end
        end

        # Compare cluster_ids format
        if python_args.any? && ruby_args.any?
          python_cluster_ids = python_args.first["cluster_ids"]
          ruby_cluster_ids = ruby_args.first["cluster_ids"]

          if python_cluster_ids.is_a?(Array) && ruby_cluster_ids.is_a?(Array)
            # Check format consistency (e.g., "0", "1_5", "2_10")
            python_format = detect_cluster_id_format(python_cluster_ids)
            ruby_format = detect_cluster_id_format(ruby_cluster_ids)

            if python_format != ruby_format
              report.add_difference(
                :arguments,
                "cluster_ids format mismatch",
                python_format: python_format,
                ruby_format: ruby_format
              )
            end
          end
        end
      end

      def compare_clusters(python, ruby, report)
        python_clusters = python["clusters"] || []
        ruby_clusters = ruby["clusters"] || []

        # Compare cluster structure
        if python_clusters.any? && ruby_clusters.any?
          python_keys = python_clusters.first.keys.sort
          ruby_keys = ruby_clusters.first.keys.sort

          missing_keys = python_keys - ruby_keys
          if missing_keys.any?
            report.add_difference(
              :clusters,
              "Missing cluster keys in Ruby output",
              missing: missing_keys
            )
          end
        end

        # Compare hierarchy levels
        python_levels = python_clusters.map { |c| c["level"] }.uniq.sort
        ruby_levels = ruby_clusters.map { |c| c["level"] }.uniq.sort

        if python_levels != ruby_levels
          report.add_difference(
            :clusters,
            "Cluster hierarchy levels differ",
            python_levels: python_levels,
            ruby_levels: ruby_levels
          )
        end

        # Compare root cluster
        python_root = python_clusters.find { |c| c["level"] == 0 }
        ruby_root = ruby_clusters.find { |c| c["level"] == 0 }

        if python_root && ruby_root
          if python_root["id"] != ruby_root["id"]
            report.add_difference(
              :clusters,
              "Root cluster ID differs",
              python_id: python_root["id"],
              ruby_id: ruby_root["id"]
            )
          end
        end
      end

      def compare_overview(python, ruby, report)
        python_overview = python["overview"].to_s.strip
        ruby_overview = ruby["overview"].to_s.strip

        if python_overview.empty? != ruby_overview.empty?
          report.add_difference(
            :overview,
            "Overview presence differs",
            python_has_overview: !python_overview.empty?,
            ruby_has_overview: !ruby_overview.empty?
          )
        end
      end

      def detect_cluster_id_format(cluster_ids)
        return :empty if cluster_ids.empty?

        formats = cluster_ids.map do |id|
          case id.to_s
          when /^\d+$/ then :numeric
          when /^\d+_\d+$/ then :level_index
          else :other
          end
        end

        formats.uniq.size == 1 ? formats.first : :mixed
      end
    end
  end
end
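A usage sketch following the module's own @example blocks; the file paths are placeholders. compare_outputs accepts either file paths or already-parsed hashes, and validate_with_schema checks a single result against the bundled schema/hierarchical_result.json.

require "broadlistening"

# Compare a Kouchou-AI (Python) result with a Ruby-generated one; paths are placeholders.
report = Broadlistening::Compatibility.compare_outputs(
  python_output: "tmp/python/hierarchical_result.json",
  ruby_output: "tmp/ruby/hierarchical_result.json"
)
puts report.summary
report.compatible?  # => true when no structural differences were recorded

# Validate a single output file against the bundled JSON Schema.
errors = Broadlistening::Compatibility.validate_with_schema("tmp/ruby/hierarchical_result.json")
errors.each { |e| warn "#{e[:path]}: #{e[:message]}" }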
data/lib/broadlistening/config.rb
@@ -0,0 +1,190 @@
# frozen_string_literal: true

require "json"

module Broadlistening
  class Config
    attr_reader :model, :embedding_model, :provider, :cluster_nums, :workers, :prompts, :api_key,
                :enable_source_link, :hidden_properties, :is_pubcom,
                :api_base_url, :local_llm_address, :azure_api_version,
                :input, :question, :name, :intro

    DEFAULT_CLUSTER_NUMS = [ 5, 15 ].freeze
    DEFAULT_WORKERS = 10
    DEFAULT_AZURE_API_VERSION = "2024-02-15-preview"

    def self.from_json(json_string)
      data = JSON.parse(json_string, symbolize_names: true)
      from_hash(data)
    end

    def self.from_hash(hash)
      prompts = hash[:prompts]&.transform_keys(&:to_sym)

      cluster_nums = hash[:cluster_nums] || hash.dig(:hierarchical_clustering, :cluster_nums)
      workers = hash[:workers] || hash.dig(:extraction, :workers)
      hidden_properties = hash[:hidden_properties] || hash.dig(:aggregation, :hidden_properties)

      new(
        api_key: hash[:api_key],
        model: hash[:model],
        embedding_model: hash[:embedding_model],
        provider: hash[:provider],
        cluster_nums: cluster_nums,
        workers: workers,
        prompts: prompts,
        enable_source_link: hash[:enable_source_link],
        hidden_properties: hidden_properties,
        is_pubcom: hash[:is_pubcom],
        api_base_url: hash[:api_base_url],
        local_llm_address: hash[:local_llm_address],
        azure_api_version: hash[:azure_api_version],
        input: hash[:input],
        question: hash[:question],
        name: hash[:name],
        intro: hash[:intro]
      )
    end

    def self.from_file(path)
      from_json(File.read(path))
    end

    def initialize(options = {})
      @local_llm_address = options[:local_llm_address] || ENV.fetch("LOCAL_LLM_ADDRESS", "localhost:11434")
      @provider_obj = Provider.new(
        options[:provider]&.to_sym || :openai,
        local_llm_address: @local_llm_address
      )
      @provider = @provider_obj.name
      @model = options[:model] || @provider_obj.default_model
      @embedding_model = options[:embedding_model] || @provider_obj.default_embedding_model
      @cluster_nums = options[:cluster_nums] || DEFAULT_CLUSTER_NUMS.dup
      @workers = options[:workers] || DEFAULT_WORKERS
      @prompts = default_prompts.merge(options[:prompts] || {})
      @api_key = options[:api_key] || @provider_obj.api_key
      @enable_source_link = options.fetch(:enable_source_link, false)
      @hidden_properties = options.fetch(:hidden_properties, {}) || {}
      @is_pubcom = options.fetch(:is_pubcom, false)
      @api_base_url = options[:api_base_url] || @provider_obj.base_url
      @azure_api_version = options[:azure_api_version] || ENV.fetch("AZURE_API_VERSION", DEFAULT_AZURE_API_VERSION)
      @input = options[:input]
      @question = options[:question]
      @name = options[:name]
      @intro = options[:intro]

      validate!
    end

    def to_h
      {
        model: model,
        embedding_model: embedding_model,
        provider: provider,
        cluster_nums: cluster_nums,
        workers: workers,
        enable_source_link: enable_source_link,
        hidden_properties: hidden_properties,
        is_pubcom: is_pubcom,
        api_base_url: api_base_url,
        local_llm_address: local_llm_address,
        azure_api_version: azure_api_version,
        input: input,
        question: question,
        name: name,
        intro: intro
      }.compact
    end

    def to_json(*args)
      to_h.merge(prompts: prompts).to_json(*args)
    end

    def save_to_file(path)
      File.write(path, JSON.pretty_generate(to_h.merge(prompts: prompts)))
    end

    def property_names
      hidden_properties.keys
    end

    private

    def validate!
      if @provider_obj.requires_api_key? && (api_key.nil? || api_key.empty?)
        raise ConfigurationError, "API key is required"
      end
      if @provider_obj.requires_base_url? && (api_base_url.nil? || api_base_url.empty?)
        raise ConfigurationError, "Azure requires api_base_url"
      end
      raise ConfigurationError, "cluster_nums must have at least 2 levels" if cluster_nums.size < 2
      raise ConfigurationError, "cluster_nums must be sorted ascending" unless cluster_nums == cluster_nums.sort
    end

    def default_prompts
      {
        extraction: extraction_prompt,
        initial_labelling: initial_labelling_prompt,
        merge_labelling: merge_labelling_prompt,
        overview: overview_prompt
      }
    end

    # Default extraction prompt (Japanese): asks the model to extract the main opinions
    # or claims from a comment, splitting multi-opinion comments, and to return JSON of
    # the form {"extractedOpinionList": [...]}; it should favor opinions over statements
    # of fact, rephrase vague wording concretely, and merge duplicates.
    def extraction_prompt
      <<~PROMPT
        あなたは意見抽出の専門家です。
        以下のコメントから、主要な意見や主張を抽出してください。
        1つのコメントに複数の意見が含まれる場合は、それぞれを別々に抽出してください。
        抽出した意見はJSON形式で返してください。

        出力フォーマット:
        {"extractedOpinionList": ["意見1", "意見2", ...]}

        注意:
        - 事実の記述ではなく、意見や主張を抽出してください
        - 曖昧な表現は具体的に言い換えてください
        - 重複する意見は1つにまとめてください
      PROMPT
    end

    # Default initial labelling prompt (Japanese): asks for a concise label
    # (about 10 characters) and a short description (about 50 characters) for an
    # opinion group, returned as {"label": ..., "description": ...}.
    def initial_labelling_prompt
      <<~PROMPT
        あなたはクラスタ分析の専門家です。
        以下の意見グループに対して、適切なラベルと説明を付けてください。

        出力フォーマット:
        {"label": "ラベル名", "description": "このグループの説明"}

        注意:
        - ラベルは簡潔で分かりやすいものにしてください(10文字以内推奨)
        - 説明はグループの特徴を端的に表してください(50文字以内推奨)
      PROMPT
    end

    # Default merge labelling prompt (Japanese): asks the model to merge child cluster
    # labels/descriptions into a parent label/description that captures their common
    # theme without becoming too abstract; same JSON output format as above.
    def merge_labelling_prompt
      <<~PROMPT
        あなたはクラスタ分析の専門家です。
        以下の子クラスタのラベルと説明を統合し、親クラスタのラベルと説明を作成してください。

        出力フォーマット:
        {"label": "ラベル名", "description": "このグループの説明"}

        注意:
        - 親ラベルは子ラベルの共通テーマを表すものにしてください
        - 抽象度を上げすぎず、具体性を保ってください
      PROMPT
    end

    # Default overview prompt (Japanese): asks for an objective overall summary of the
    # cluster analysis, covering the main themes and trends in roughly 200-300 characters.
    def overview_prompt
      <<~PROMPT
        あなたは分析レポートの専門家です。
        以下のクラスタ分析結果に基づいて、全体の概要を作成してください。

        注意:
        - 主要なテーマや傾向を簡潔にまとめてください
        - 200-300文字程度で記述してください
        - 客観的な記述を心がけてください
      PROMPT
    end
  end
end
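A construction sketch with illustrative values; provider behavior (default models, API key lookup) comes from the Provider class, which is not shown in this hunk. validate! raises ConfigurationError if a required API key is missing or if cluster_nums is not an ascending list of at least two levels.

require "broadlistening"

# Illustrative values; the provider name and cluster sizes are assumptions, not defaults.
config = Broadlistening::Config.from_hash(
  provider: "openai",
  api_key: ENV["OPENAI_API_KEY"],
  cluster_nums: [ 5, 15, 30 ],
  workers: 4,
  question: "What should the city improve?"
)

config.property_names              # => [] unless hidden_properties are configured
config.save_to_file("config.json") # serialized config, including the (Japanese) default prompts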