broadlistening 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +3 -0
- data/CHANGELOG.md +40 -0
- data/CLAUDE.md +112 -0
- data/LICENSE +24 -0
- data/LICENSE-AGPLv3.txt +661 -0
- data/README.md +195 -0
- data/Rakefile +77 -0
- data/exe/broadlistening +6 -0
- data/lib/broadlistening/argument.rb +136 -0
- data/lib/broadlistening/cli.rb +196 -0
- data/lib/broadlistening/comment.rb +128 -0
- data/lib/broadlistening/compatibility.rb +375 -0
- data/lib/broadlistening/config.rb +190 -0
- data/lib/broadlistening/context.rb +180 -0
- data/lib/broadlistening/csv_loader.rb +109 -0
- data/lib/broadlistening/hierarchical_clustering.rb +142 -0
- data/lib/broadlistening/kmeans.rb +185 -0
- data/lib/broadlistening/llm_client.rb +84 -0
- data/lib/broadlistening/pipeline.rb +129 -0
- data/lib/broadlistening/planner.rb +114 -0
- data/lib/broadlistening/provider.rb +97 -0
- data/lib/broadlistening/spec_loader.rb +86 -0
- data/lib/broadlistening/status.rb +132 -0
- data/lib/broadlistening/steps/aggregation.rb +228 -0
- data/lib/broadlistening/steps/base_step.rb +42 -0
- data/lib/broadlistening/steps/clustering.rb +103 -0
- data/lib/broadlistening/steps/embedding.rb +40 -0
- data/lib/broadlistening/steps/extraction.rb +73 -0
- data/lib/broadlistening/steps/initial_labelling.rb +85 -0
- data/lib/broadlistening/steps/merge_labelling.rb +93 -0
- data/lib/broadlistening/steps/overview.rb +36 -0
- data/lib/broadlistening/version.rb +5 -0
- data/lib/broadlistening.rb +44 -0
- data/schema/hierarchical_result.json +152 -0
- data/sig/broadlistening.rbs +4 -0
- metadata +194 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "active_support"
|
|
4
|
+
require "active_support/core_ext/string/inflections"
|
|
5
|
+
require "numo/narray"
|
|
6
|
+
require "openai"
|
|
7
|
+
require "parallel"
|
|
8
|
+
require "json"
|
|
9
|
+
require "umappp"
|
|
10
|
+
|
|
11
|
+
require_relative "broadlistening/version"
|
|
12
|
+
require_relative "broadlistening/provider"
|
|
13
|
+
require_relative "broadlistening/config"
|
|
14
|
+
require_relative "broadlistening/spec_loader"
|
|
15
|
+
require_relative "broadlistening/status"
|
|
16
|
+
require_relative "broadlistening/planner"
|
|
17
|
+
require_relative "broadlistening/comment"
|
|
18
|
+
require_relative "broadlistening/argument"
|
|
19
|
+
require_relative "broadlistening/csv_loader"
|
|
20
|
+
require_relative "broadlistening/compatibility"
|
|
21
|
+
require_relative "broadlistening/context"
|
|
22
|
+
require_relative "broadlistening/pipeline"
|
|
23
|
+
require_relative "broadlistening/cli"
|
|
24
|
+
|
|
25
|
+
require_relative "broadlistening/llm_client"
|
|
26
|
+
require_relative "broadlistening/kmeans"
|
|
27
|
+
require_relative "broadlistening/hierarchical_clustering"
|
|
28
|
+
|
|
29
|
+
# Steps
|
|
30
|
+
require_relative "broadlistening/steps/base_step"
|
|
31
|
+
require_relative "broadlistening/steps/extraction"
|
|
32
|
+
require_relative "broadlistening/steps/embedding"
|
|
33
|
+
require_relative "broadlistening/steps/clustering"
|
|
34
|
+
require_relative "broadlistening/steps/initial_labelling"
|
|
35
|
+
require_relative "broadlistening/steps/merge_labelling"
|
|
36
|
+
require_relative "broadlistening/steps/overview"
|
|
37
|
+
require_relative "broadlistening/steps/aggregation"
|
|
38
|
+
|
|
39
|
+
module Broadlistening
|
|
40
|
+
class Error < StandardError; end
|
|
41
|
+
class ConfigurationError < Error; end
|
|
42
|
+
class LlmError < Error; end
|
|
43
|
+
class ClusteringError < Error; end
|
|
44
|
+
end
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"$id": "https://github.com/takahashim/broadlistening-ruby/blob/main/schema/hierarchical_result.json",
|
|
4
|
+
"title": "Hierarchical Result",
|
|
5
|
+
"description": "Output schema for Kouchou-AI / Broadlistening pipeline results. This schema defines the structure of hierarchical_result.json produced by both Python and Ruby implementations.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"required": ["arguments", "clusters", "comments", "propertyMap", "translations", "overview", "config"],
|
|
8
|
+
"properties": {
|
|
9
|
+
"arguments": {
|
|
10
|
+
"type": "array",
|
|
11
|
+
"description": "List of extracted arguments/opinions from comments",
|
|
12
|
+
"items": {
|
|
13
|
+
"$ref": "#/definitions/argument"
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
"clusters": {
|
|
17
|
+
"type": "array",
|
|
18
|
+
"description": "Hierarchical cluster definitions",
|
|
19
|
+
"items": {
|
|
20
|
+
"$ref": "#/definitions/cluster"
|
|
21
|
+
}
|
|
22
|
+
},
|
|
23
|
+
"comments": {
|
|
24
|
+
"type": "object",
|
|
25
|
+
"description": "Map of comment_id to comment data",
|
|
26
|
+
"additionalProperties": {
|
|
27
|
+
"type": "object",
|
|
28
|
+
"properties": {
|
|
29
|
+
"comment": {
|
|
30
|
+
"type": "string",
|
|
31
|
+
"description": "Original comment text"
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
},
|
|
36
|
+
"propertyMap": {
|
|
37
|
+
"type": "object",
|
|
38
|
+
"description": "Map of property names to argument property values",
|
|
39
|
+
"additionalProperties": {
|
|
40
|
+
"type": "object",
|
|
41
|
+
"additionalProperties": true
|
|
42
|
+
}
|
|
43
|
+
},
|
|
44
|
+
"translations": {
|
|
45
|
+
"type": "object",
|
|
46
|
+
"description": "Translation data (reserved for future use)",
|
|
47
|
+
"additionalProperties": true
|
|
48
|
+
},
|
|
49
|
+
"overview": {
|
|
50
|
+
"type": "string",
|
|
51
|
+
"description": "LLM-generated overview summarizing all clusters"
|
|
52
|
+
},
|
|
53
|
+
"config": {
|
|
54
|
+
"type": "object",
|
|
55
|
+
"description": "Pipeline configuration used to generate this result",
|
|
56
|
+
"additionalProperties": true
|
|
57
|
+
},
|
|
58
|
+
"comment_num": {
|
|
59
|
+
"type": "integer",
|
|
60
|
+
"description": "Total number of input comments (optional)"
|
|
61
|
+
}
|
|
62
|
+
},
|
|
63
|
+
"definitions": {
|
|
64
|
+
"argument": {
|
|
65
|
+
"type": "object",
|
|
66
|
+
"description": "An extracted argument/opinion from a comment",
|
|
67
|
+
"required": ["arg_id", "argument", "comment_id", "x", "y", "cluster_ids"],
|
|
68
|
+
"properties": {
|
|
69
|
+
"arg_id": {
|
|
70
|
+
"type": "string",
|
|
71
|
+
"description": "Unique argument identifier in format A{comment_id}_{index}",
|
|
72
|
+
"pattern": "^A\\d+_\\d+$"
|
|
73
|
+
},
|
|
74
|
+
"argument": {
|
|
75
|
+
"type": "string",
|
|
76
|
+
"description": "The extracted argument/opinion text"
|
|
77
|
+
},
|
|
78
|
+
"comment_id": {
|
|
79
|
+
"type": ["integer", "string"],
|
|
80
|
+
"description": "ID of the source comment"
|
|
81
|
+
},
|
|
82
|
+
"x": {
|
|
83
|
+
"type": "number",
|
|
84
|
+
"description": "UMAP x-coordinate for visualization"
|
|
85
|
+
},
|
|
86
|
+
"y": {
|
|
87
|
+
"type": "number",
|
|
88
|
+
"description": "UMAP y-coordinate for visualization"
|
|
89
|
+
},
|
|
90
|
+
"p": {
|
|
91
|
+
"type": ["integer", "number"],
|
|
92
|
+
"description": "Priority or ordering value (optional)"
|
|
93
|
+
},
|
|
94
|
+
"cluster_ids": {
|
|
95
|
+
"type": "array",
|
|
96
|
+
"description": "List of cluster IDs this argument belongs to (from root to leaf)",
|
|
97
|
+
"items": {
|
|
98
|
+
"type": "string"
|
|
99
|
+
},
|
|
100
|
+
"minItems": 1
|
|
101
|
+
},
|
|
102
|
+
"attributes": {
|
|
103
|
+
"type": ["object", "null"],
|
|
104
|
+
"description": "Additional attributes extracted from source comment",
|
|
105
|
+
"additionalProperties": true
|
|
106
|
+
},
|
|
107
|
+
"url": {
|
|
108
|
+
"type": ["string", "null"],
|
|
109
|
+
"description": "Source URL of the original comment"
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
},
|
|
113
|
+
"cluster": {
|
|
114
|
+
"type": "object",
|
|
115
|
+
"description": "A cluster in the hierarchy",
|
|
116
|
+
"required": ["level", "id", "label", "takeaway", "value", "parent"],
|
|
117
|
+
"properties": {
|
|
118
|
+
"level": {
|
|
119
|
+
"type": "integer",
|
|
120
|
+
"description": "Hierarchy level (0 = root)",
|
|
121
|
+
"minimum": 0
|
|
122
|
+
},
|
|
123
|
+
"id": {
|
|
124
|
+
"type": "string",
|
|
125
|
+
"description": "Unique cluster identifier. Root is '0', others are '{level}_{index}'",
|
|
126
|
+
"pattern": "^(0|\\d+_\\d+)$"
|
|
127
|
+
},
|
|
128
|
+
"label": {
|
|
129
|
+
"type": "string",
|
|
130
|
+
"description": "LLM-generated label for this cluster"
|
|
131
|
+
},
|
|
132
|
+
"takeaway": {
|
|
133
|
+
"type": "string",
|
|
134
|
+
"description": "LLM-generated description/takeaway for this cluster"
|
|
135
|
+
},
|
|
136
|
+
"value": {
|
|
137
|
+
"type": "integer",
|
|
138
|
+
"description": "Number of arguments in this cluster",
|
|
139
|
+
"minimum": 0
|
|
140
|
+
},
|
|
141
|
+
"parent": {
|
|
142
|
+
"type": "string",
|
|
143
|
+
"description": "ID of parent cluster (empty string for root)"
|
|
144
|
+
},
|
|
145
|
+
"density_rank_percentile": {
|
|
146
|
+
"type": ["number", "null"],
|
|
147
|
+
"description": "Density ranking percentile (optional)"
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
}
|
metadata
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: broadlistening
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 0.7.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- takahashim
|
|
8
|
+
bindir: exe
|
|
9
|
+
cert_chain: []
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: activesupport
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '7.0'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '7.0'
|
|
26
|
+
- !ruby/object:Gem::Dependency
|
|
27
|
+
name: csv
|
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
|
29
|
+
requirements:
|
|
30
|
+
- - ">="
|
|
31
|
+
- !ruby/object:Gem::Version
|
|
32
|
+
version: '3.0'
|
|
33
|
+
type: :runtime
|
|
34
|
+
prerelease: false
|
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
36
|
+
requirements:
|
|
37
|
+
- - ">="
|
|
38
|
+
- !ruby/object:Gem::Version
|
|
39
|
+
version: '3.0'
|
|
40
|
+
- !ruby/object:Gem::Dependency
|
|
41
|
+
name: numo-narray
|
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
|
43
|
+
requirements:
|
|
44
|
+
- - "~>"
|
|
45
|
+
- !ruby/object:Gem::Version
|
|
46
|
+
version: '0.9'
|
|
47
|
+
type: :runtime
|
|
48
|
+
prerelease: false
|
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
50
|
+
requirements:
|
|
51
|
+
- - "~>"
|
|
52
|
+
- !ruby/object:Gem::Version
|
|
53
|
+
version: '0.9'
|
|
54
|
+
- !ruby/object:Gem::Dependency
|
|
55
|
+
name: parallel
|
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
|
57
|
+
requirements:
|
|
58
|
+
- - "~>"
|
|
59
|
+
- !ruby/object:Gem::Version
|
|
60
|
+
version: '1.20'
|
|
61
|
+
type: :runtime
|
|
62
|
+
prerelease: false
|
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
64
|
+
requirements:
|
|
65
|
+
- - "~>"
|
|
66
|
+
- !ruby/object:Gem::Version
|
|
67
|
+
version: '1.20'
|
|
68
|
+
- !ruby/object:Gem::Dependency
|
|
69
|
+
name: rice
|
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
|
71
|
+
requirements:
|
|
72
|
+
- - "~>"
|
|
73
|
+
- !ruby/object:Gem::Version
|
|
74
|
+
version: 4.6.0
|
|
75
|
+
type: :runtime
|
|
76
|
+
prerelease: false
|
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
78
|
+
requirements:
|
|
79
|
+
- - "~>"
|
|
80
|
+
- !ruby/object:Gem::Version
|
|
81
|
+
version: 4.6.0
|
|
82
|
+
- !ruby/object:Gem::Dependency
|
|
83
|
+
name: ruby-openai
|
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
|
85
|
+
requirements:
|
|
86
|
+
- - "~>"
|
|
87
|
+
- !ruby/object:Gem::Version
|
|
88
|
+
version: '7.0'
|
|
89
|
+
type: :runtime
|
|
90
|
+
prerelease: false
|
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
92
|
+
requirements:
|
|
93
|
+
- - "~>"
|
|
94
|
+
- !ruby/object:Gem::Version
|
|
95
|
+
version: '7.0'
|
|
96
|
+
- !ruby/object:Gem::Dependency
|
|
97
|
+
name: umappp
|
|
98
|
+
requirement: !ruby/object:Gem::Requirement
|
|
99
|
+
requirements:
|
|
100
|
+
- - "~>"
|
|
101
|
+
- !ruby/object:Gem::Version
|
|
102
|
+
version: '0.2'
|
|
103
|
+
type: :runtime
|
|
104
|
+
prerelease: false
|
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
106
|
+
requirements:
|
|
107
|
+
- - "~>"
|
|
108
|
+
- !ruby/object:Gem::Version
|
|
109
|
+
version: '0.2'
|
|
110
|
+
- !ruby/object:Gem::Dependency
|
|
111
|
+
name: json_schemer
|
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
|
113
|
+
requirements:
|
|
114
|
+
- - "~>"
|
|
115
|
+
- !ruby/object:Gem::Version
|
|
116
|
+
version: '2.0'
|
|
117
|
+
type: :runtime
|
|
118
|
+
prerelease: false
|
|
119
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
120
|
+
requirements:
|
|
121
|
+
- - "~>"
|
|
122
|
+
- !ruby/object:Gem::Version
|
|
123
|
+
version: '2.0'
|
|
124
|
+
description: A Ruby implementation of the Broadlistening pipeline for clustering and
|
|
125
|
+
analyzing public comments using LLM
|
|
126
|
+
email:
|
|
127
|
+
- takahashimm@gmail.com
|
|
128
|
+
executables:
|
|
129
|
+
- broadlistening
|
|
130
|
+
extensions: []
|
|
131
|
+
extra_rdoc_files: []
|
|
132
|
+
files:
|
|
133
|
+
- ".rspec"
|
|
134
|
+
- ".rubocop.yml"
|
|
135
|
+
- CHANGELOG.md
|
|
136
|
+
- CLAUDE.md
|
|
137
|
+
- LICENSE
|
|
138
|
+
- LICENSE-AGPLv3.txt
|
|
139
|
+
- README.md
|
|
140
|
+
- Rakefile
|
|
141
|
+
- exe/broadlistening
|
|
142
|
+
- lib/broadlistening.rb
|
|
143
|
+
- lib/broadlistening/argument.rb
|
|
144
|
+
- lib/broadlistening/cli.rb
|
|
145
|
+
- lib/broadlistening/comment.rb
|
|
146
|
+
- lib/broadlistening/compatibility.rb
|
|
147
|
+
- lib/broadlistening/config.rb
|
|
148
|
+
- lib/broadlistening/context.rb
|
|
149
|
+
- lib/broadlistening/csv_loader.rb
|
|
150
|
+
- lib/broadlistening/hierarchical_clustering.rb
|
|
151
|
+
- lib/broadlistening/kmeans.rb
|
|
152
|
+
- lib/broadlistening/llm_client.rb
|
|
153
|
+
- lib/broadlistening/pipeline.rb
|
|
154
|
+
- lib/broadlistening/planner.rb
|
|
155
|
+
- lib/broadlistening/provider.rb
|
|
156
|
+
- lib/broadlistening/spec_loader.rb
|
|
157
|
+
- lib/broadlistening/status.rb
|
|
158
|
+
- lib/broadlistening/steps/aggregation.rb
|
|
159
|
+
- lib/broadlistening/steps/base_step.rb
|
|
160
|
+
- lib/broadlistening/steps/clustering.rb
|
|
161
|
+
- lib/broadlistening/steps/embedding.rb
|
|
162
|
+
- lib/broadlistening/steps/extraction.rb
|
|
163
|
+
- lib/broadlistening/steps/initial_labelling.rb
|
|
164
|
+
- lib/broadlistening/steps/merge_labelling.rb
|
|
165
|
+
- lib/broadlistening/steps/overview.rb
|
|
166
|
+
- lib/broadlistening/version.rb
|
|
167
|
+
- schema/hierarchical_result.json
|
|
168
|
+
- sig/broadlistening.rbs
|
|
169
|
+
homepage: https://github.com/takahashim/broadlistening-ruby
|
|
170
|
+
licenses:
|
|
171
|
+
- AGPL-3.0
|
|
172
|
+
metadata:
|
|
173
|
+
homepage_uri: https://github.com/takahashim/broadlistening-ruby
|
|
174
|
+
source_code_uri: https://github.com/takahashim/broadlistening-ruby
|
|
175
|
+
changelog_uri: https://github.com/takahashim/broadlistening-ruby/blob/main/CHANGELOG.md
|
|
176
|
+
rubygems_mfa_required: 'true'
|
|
177
|
+
rdoc_options: []
|
|
178
|
+
require_paths:
|
|
179
|
+
- lib
|
|
180
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
181
|
+
requirements:
|
|
182
|
+
- - ">="
|
|
183
|
+
- !ruby/object:Gem::Version
|
|
184
|
+
version: 3.1.0
|
|
185
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
186
|
+
requirements:
|
|
187
|
+
- - ">="
|
|
188
|
+
- !ruby/object:Gem::Version
|
|
189
|
+
version: '0'
|
|
190
|
+
requirements: []
|
|
191
|
+
rubygems_version: 3.6.7
|
|
192
|
+
specification_version: 4
|
|
193
|
+
summary: Broadlistening pipeline for opinion analysis
|
|
194
|
+
test_files: []
|