broadlistening 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +3 -0
  4. data/CHANGELOG.md +40 -0
  5. data/CLAUDE.md +112 -0
  6. data/LICENSE +24 -0
  7. data/LICENSE-AGPLv3.txt +661 -0
  8. data/README.md +195 -0
  9. data/Rakefile +77 -0
  10. data/exe/broadlistening +6 -0
  11. data/lib/broadlistening/argument.rb +136 -0
  12. data/lib/broadlistening/cli.rb +196 -0
  13. data/lib/broadlistening/comment.rb +128 -0
  14. data/lib/broadlistening/compatibility.rb +375 -0
  15. data/lib/broadlistening/config.rb +190 -0
  16. data/lib/broadlistening/context.rb +180 -0
  17. data/lib/broadlistening/csv_loader.rb +109 -0
  18. data/lib/broadlistening/hierarchical_clustering.rb +142 -0
  19. data/lib/broadlistening/kmeans.rb +185 -0
  20. data/lib/broadlistening/llm_client.rb +84 -0
  21. data/lib/broadlistening/pipeline.rb +129 -0
  22. data/lib/broadlistening/planner.rb +114 -0
  23. data/lib/broadlistening/provider.rb +97 -0
  24. data/lib/broadlistening/spec_loader.rb +86 -0
  25. data/lib/broadlistening/status.rb +132 -0
  26. data/lib/broadlistening/steps/aggregation.rb +228 -0
  27. data/lib/broadlistening/steps/base_step.rb +42 -0
  28. data/lib/broadlistening/steps/clustering.rb +103 -0
  29. data/lib/broadlistening/steps/embedding.rb +40 -0
  30. data/lib/broadlistening/steps/extraction.rb +73 -0
  31. data/lib/broadlistening/steps/initial_labelling.rb +85 -0
  32. data/lib/broadlistening/steps/merge_labelling.rb +93 -0
  33. data/lib/broadlistening/steps/overview.rb +36 -0
  34. data/lib/broadlistening/version.rb +5 -0
  35. data/lib/broadlistening.rb +44 -0
  36. data/schema/hierarchical_result.json +152 -0
  37. data/sig/broadlistening.rbs +4 -0
  38. metadata +194 -0
@@ -0,0 +1,44 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support"
4
+ require "active_support/core_ext/string/inflections"
5
+ require "numo/narray"
6
+ require "openai"
7
+ require "parallel"
8
+ require "json"
9
+ require "umappp"
10
+
11
+ require_relative "broadlistening/version"
12
+ require_relative "broadlistening/provider"
13
+ require_relative "broadlistening/config"
14
+ require_relative "broadlistening/spec_loader"
15
+ require_relative "broadlistening/status"
16
+ require_relative "broadlistening/planner"
17
+ require_relative "broadlistening/comment"
18
+ require_relative "broadlistening/argument"
19
+ require_relative "broadlistening/csv_loader"
20
+ require_relative "broadlistening/compatibility"
21
+ require_relative "broadlistening/context"
22
+ require_relative "broadlistening/pipeline"
23
+ require_relative "broadlistening/cli"
24
+
25
+ require_relative "broadlistening/llm_client"
26
+ require_relative "broadlistening/kmeans"
27
+ require_relative "broadlistening/hierarchical_clustering"
28
+
29
+ # Steps
30
+ require_relative "broadlistening/steps/base_step"
31
+ require_relative "broadlistening/steps/extraction"
32
+ require_relative "broadlistening/steps/embedding"
33
+ require_relative "broadlistening/steps/clustering"
34
+ require_relative "broadlistening/steps/initial_labelling"
35
+ require_relative "broadlistening/steps/merge_labelling"
36
+ require_relative "broadlistening/steps/overview"
37
+ require_relative "broadlistening/steps/aggregation"
38
+
39
+ module Broadlistening
40
+ class Error < StandardError; end
41
+ class ConfigurationError < Error; end
42
+ class LlmError < Error; end
43
+ class ClusteringError < Error; end
44
+ end
@@ -0,0 +1,152 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "$id": "https://github.com/takahashim/broadlistening-ruby/blob/main/schema/hierarchical_result.json",
4
+ "title": "Hierarchical Result",
5
+ "description": "Output schema for Kouchou-AI / Broadlistening pipeline results. This schema defines the structure of hierarchical_result.json produced by both Python and Ruby implementations.",
6
+ "type": "object",
7
+ "required": ["arguments", "clusters", "comments", "propertyMap", "translations", "overview", "config"],
8
+ "properties": {
9
+ "arguments": {
10
+ "type": "array",
11
+ "description": "List of extracted arguments/opinions from comments",
12
+ "items": {
13
+ "$ref": "#/definitions/argument"
14
+ }
15
+ },
16
+ "clusters": {
17
+ "type": "array",
18
+ "description": "Hierarchical cluster definitions",
19
+ "items": {
20
+ "$ref": "#/definitions/cluster"
21
+ }
22
+ },
23
+ "comments": {
24
+ "type": "object",
25
+ "description": "Map of comment_id to comment data",
26
+ "additionalProperties": {
27
+ "type": "object",
28
+ "properties": {
29
+ "comment": {
30
+ "type": "string",
31
+ "description": "Original comment text"
32
+ }
33
+ }
34
+ }
35
+ },
36
+ "propertyMap": {
37
+ "type": "object",
38
+ "description": "Map of property names to argument property values",
39
+ "additionalProperties": {
40
+ "type": "object",
41
+ "additionalProperties": true
42
+ }
43
+ },
44
+ "translations": {
45
+ "type": "object",
46
+ "description": "Translation data (reserved for future use)",
47
+ "additionalProperties": true
48
+ },
49
+ "overview": {
50
+ "type": "string",
51
+ "description": "LLM-generated overview summarizing all clusters"
52
+ },
53
+ "config": {
54
+ "type": "object",
55
+ "description": "Pipeline configuration used to generate this result",
56
+ "additionalProperties": true
57
+ },
58
+ "comment_num": {
59
+ "type": "integer",
60
+ "description": "Total number of input comments (optional)"
61
+ }
62
+ },
63
+ "definitions": {
64
+ "argument": {
65
+ "type": "object",
66
+ "description": "An extracted argument/opinion from a comment",
67
+ "required": ["arg_id", "argument", "comment_id", "x", "y", "cluster_ids"],
68
+ "properties": {
69
+ "arg_id": {
70
+ "type": "string",
71
+ "description": "Unique argument identifier in format A{comment_id}_{index}",
72
+ "pattern": "^A\\d+_\\d+$"
73
+ },
74
+ "argument": {
75
+ "type": "string",
76
+ "description": "The extracted argument/opinion text"
77
+ },
78
+ "comment_id": {
79
+ "type": ["integer", "string"],
80
+ "description": "ID of the source comment"
81
+ },
82
+ "x": {
83
+ "type": "number",
84
+ "description": "UMAP x-coordinate for visualization"
85
+ },
86
+ "y": {
87
+ "type": "number",
88
+ "description": "UMAP y-coordinate for visualization"
89
+ },
90
+ "p": {
91
+ "type": ["integer", "number"],
92
+ "description": "Priority or ordering value (optional)"
93
+ },
94
+ "cluster_ids": {
95
+ "type": "array",
96
+ "description": "List of cluster IDs this argument belongs to (from root to leaf)",
97
+ "items": {
98
+ "type": "string"
99
+ },
100
+ "minItems": 1
101
+ },
102
+ "attributes": {
103
+ "type": ["object", "null"],
104
+ "description": "Additional attributes extracted from source comment",
105
+ "additionalProperties": true
106
+ },
107
+ "url": {
108
+ "type": ["string", "null"],
109
+ "description": "Source URL of the original comment"
110
+ }
111
+ }
112
+ },
113
+ "cluster": {
114
+ "type": "object",
115
+ "description": "A cluster in the hierarchy",
116
+ "required": ["level", "id", "label", "takeaway", "value", "parent"],
117
+ "properties": {
118
+ "level": {
119
+ "type": "integer",
120
+ "description": "Hierarchy level (0 = root)",
121
+ "minimum": 0
122
+ },
123
+ "id": {
124
+ "type": "string",
125
+ "description": "Unique cluster identifier. Root is '0', others are '{level}_{index}'",
126
+ "pattern": "^(0|\\d+_\\d+)$"
127
+ },
128
+ "label": {
129
+ "type": "string",
130
+ "description": "LLM-generated label for this cluster"
131
+ },
132
+ "takeaway": {
133
+ "type": "string",
134
+ "description": "LLM-generated description/takeaway for this cluster"
135
+ },
136
+ "value": {
137
+ "type": "integer",
138
+ "description": "Number of arguments in this cluster",
139
+ "minimum": 0
140
+ },
141
+ "parent": {
142
+ "type": "string",
143
+ "description": "ID of parent cluster (empty string for root)"
144
+ },
145
+ "density_rank_percentile": {
146
+ "type": ["number", "null"],
147
+ "description": "Density ranking percentile (optional)"
148
+ }
149
+ }
150
+ }
151
+ }
152
+ }
@@ -0,0 +1,4 @@
1
+ module Broadlistening
2
+ VERSION: String
3
+ # See the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
metadata ADDED
@@ -0,0 +1,194 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: broadlistening
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.7.0
5
+ platform: ruby
6
+ authors:
7
+ - takahashim
8
+ bindir: exe
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: activesupport
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '7.0'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '7.0'
26
+ - !ruby/object:Gem::Dependency
27
+ name: csv
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '3.0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: '3.0'
40
+ - !ruby/object:Gem::Dependency
41
+ name: numo-narray
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '0.9'
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - "~>"
52
+ - !ruby/object:Gem::Version
53
+ version: '0.9'
54
+ - !ruby/object:Gem::Dependency
55
+ name: parallel
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - "~>"
59
+ - !ruby/object:Gem::Version
60
+ version: '1.20'
61
+ type: :runtime
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '1.20'
68
+ - !ruby/object:Gem::Dependency
69
+ name: rice
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: 4.6.0
75
+ type: :runtime
76
+ prerelease: false
77
+ version_requirements: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: 4.6.0
82
+ - !ruby/object:Gem::Dependency
83
+ name: ruby-openai
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '7.0'
89
+ type: :runtime
90
+ prerelease: false
91
+ version_requirements: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - "~>"
94
+ - !ruby/object:Gem::Version
95
+ version: '7.0'
96
+ - !ruby/object:Gem::Dependency
97
+ name: umappp
98
+ requirement: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - "~>"
101
+ - !ruby/object:Gem::Version
102
+ version: '0.2'
103
+ type: :runtime
104
+ prerelease: false
105
+ version_requirements: !ruby/object:Gem::Requirement
106
+ requirements:
107
+ - - "~>"
108
+ - !ruby/object:Gem::Version
109
+ version: '0.2'
110
+ - !ruby/object:Gem::Dependency
111
+ name: json_schemer
112
+ requirement: !ruby/object:Gem::Requirement
113
+ requirements:
114
+ - - "~>"
115
+ - !ruby/object:Gem::Version
116
+ version: '2.0'
117
+ type: :runtime
118
+ prerelease: false
119
+ version_requirements: !ruby/object:Gem::Requirement
120
+ requirements:
121
+ - - "~>"
122
+ - !ruby/object:Gem::Version
123
+ version: '2.0'
124
+ description: A Ruby implementation of the Broadlistening pipeline for clustering and
125
+ analyzing public comments using LLM
126
+ email:
127
+ - takahashimm@gmail.com
128
+ executables:
129
+ - broadlistening
130
+ extensions: []
131
+ extra_rdoc_files: []
132
+ files:
133
+ - ".rspec"
134
+ - ".rubocop.yml"
135
+ - CHANGELOG.md
136
+ - CLAUDE.md
137
+ - LICENSE
138
+ - LICENSE-AGPLv3.txt
139
+ - README.md
140
+ - Rakefile
141
+ - exe/broadlistening
142
+ - lib/broadlistening.rb
143
+ - lib/broadlistening/argument.rb
144
+ - lib/broadlistening/cli.rb
145
+ - lib/broadlistening/comment.rb
146
+ - lib/broadlistening/compatibility.rb
147
+ - lib/broadlistening/config.rb
148
+ - lib/broadlistening/context.rb
149
+ - lib/broadlistening/csv_loader.rb
150
+ - lib/broadlistening/hierarchical_clustering.rb
151
+ - lib/broadlistening/kmeans.rb
152
+ - lib/broadlistening/llm_client.rb
153
+ - lib/broadlistening/pipeline.rb
154
+ - lib/broadlistening/planner.rb
155
+ - lib/broadlistening/provider.rb
156
+ - lib/broadlistening/spec_loader.rb
157
+ - lib/broadlistening/status.rb
158
+ - lib/broadlistening/steps/aggregation.rb
159
+ - lib/broadlistening/steps/base_step.rb
160
+ - lib/broadlistening/steps/clustering.rb
161
+ - lib/broadlistening/steps/embedding.rb
162
+ - lib/broadlistening/steps/extraction.rb
163
+ - lib/broadlistening/steps/initial_labelling.rb
164
+ - lib/broadlistening/steps/merge_labelling.rb
165
+ - lib/broadlistening/steps/overview.rb
166
+ - lib/broadlistening/version.rb
167
+ - schema/hierarchical_result.json
168
+ - sig/broadlistening.rbs
169
+ homepage: https://github.com/takahashim/broadlistening-ruby
170
+ licenses:
171
+ - AGPL-3.0
172
+ metadata:
173
+ homepage_uri: https://github.com/takahashim/broadlistening-ruby
174
+ source_code_uri: https://github.com/takahashim/broadlistening-ruby
175
+ changelog_uri: https://github.com/takahashim/broadlistening-ruby/blob/main/CHANGELOG.md
176
+ rubygems_mfa_required: 'true'
177
+ rdoc_options: []
178
+ require_paths:
179
+ - lib
180
+ required_ruby_version: !ruby/object:Gem::Requirement
181
+ requirements:
182
+ - - ">="
183
+ - !ruby/object:Gem::Version
184
+ version: 3.1.0
185
+ required_rubygems_version: !ruby/object:Gem::Requirement
186
+ requirements:
187
+ - - ">="
188
+ - !ruby/object:Gem::Version
189
+ version: '0'
190
+ requirements: []
191
+ rubygems_version: 3.6.7
192
+ specification_version: 4
193
+ summary: Broadlistening pipeline for opinion analysis
194
+ test_files: []